diff --git a/1001-config-add-syslog-ng-and-logrotate-config.patch b/1001-config-add-syslog-ng-and-logrotate-config.patch new file mode 100644 index 0000000000000000000000000000000000000000..b540055656d82164142d04775163ec55859d4f8c --- /dev/null +++ b/1001-config-add-syslog-ng-and-logrotate-config.patch @@ -0,0 +1,203 @@ +From 6949adcd0e7595000b882d57ebc7e3f47c40508e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 16:24:56 +0800 +Subject: [PATCH 01/30] config: add syslog-ng and logrotate config + +redirect all rasdaemon log to /var/log/rasdaemon and config logrotate, +add related modification in rasdaemon.spec + +The patch does not directly add a dependency on the syslog-ng package +to the rasdaemon RPM. Instead, it dynamically checks whether the +syslog-ng service is running during installation and configures accordingly. + +Signed-off-by: Bing Wu +Signed-off-by: Ruidong Tian +--- + Makefile.am | 31 +++++++++++++++++++++----- + man/rasdaemon.1.in | 3 ++- + misc/rasdaemon.logrotate.in | 14 ++++++++++++ + misc/rasdaemon.spec.in | 43 +++++++++++++++++++++++++++++++++---- + misc/rasdaemon.syslog-ng.in | 7 ++++++ + 6 files changed, 90 insertions(+), 10 deletions(-) + create mode 100644 misc/rasdaemon.logrotate.in + create mode 100644 misc/rasdaemon.syslog-ng.in + +diff --git a/Makefile.am b/Makefile.am +index 01132fe..a1f6edf 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -5,27 +5,42 @@ ACLOCAL_AMFLAGS=-I m4 + SUBDIRS = util man + SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in + SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) ++SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in ++SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng) ++LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in ++LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) + EXTRA_DIST = \ +- $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env \ ++ $(SYSTEMD_SERVICES_IN) \ ++ $(SYSLOG_SERVICES_IN) \ ++ 
$(LOGROTATE_SERVICES_IN) \ ++ misc/rasdaemon.env \ + contrib/mc_event_trigger \ + contrib/mem_fail_trigger + + CLEANFILES= \ + misc/ras-mc-ctl.service \ +- misc/rasdaemon.service ++ misc/rasdaemon.service \ ++ misc/rasdaemon.syslog-ng \ ++ misc/rasdaemon.logrotate + + DISTCLEANFILES = misc/rasdaemon.spec + + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + ++.logrotate.in.logrotate: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ ++.syslog-ng.in.syslog-ng: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -128,6 +143,12 @@ upload: + install-data-local: + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" +- $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" + $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger" + $(install_sh) @abs_srcdir@/contrib/mem_fail_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger" ++ if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng 
"$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ ++ fi ++ if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ ++ fi +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index 7cfef54..e884e55 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -34,7 +34,8 @@ rasdaemon \- RAS daemon to log the RAS events. + The \fBrasdaemon\fR program is a daemon which monitors the platform + Reliablity, Availability and Serviceability (RAS) reports from the + Linux kernel trace events. These trace events are logged in +-/sys/kernel/debug/tracing, reporting them via syslog/journald. ++/sys/kernel/debug/tracing, reporting them via syslog/journald. If ++syslog-ng is installed, the events will be logged at @localstatedir@/log/rasdaemon. + + .SH OPTIONS + .TP +diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in +new file mode 100644 +index 0000000..b7b62fe +--- /dev/null ++++ b/misc/rasdaemon.logrotate.in +@@ -0,0 +1,14 @@ ++@localstatedir@/log/rasdaemon { ++ compress ++ monthly ++ size 100M ++ dateext ++ rotate 4 ++ notifempty ++ missingok ++ copytruncate ++ sharedscripts ++ postrotate ++ @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true ++ endscript ++} +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 32c69b7..8ab3d50 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -49,20 +49,55 @@ make %{?_smp_mflags} + + %install + make install DESTDIR=%{buildroot} +-install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service ++install -D -p -m 0644 misc/%{name}.service %{buildroot}%{_unitdir}/%{name}.service + install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service +-install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} ++install -D -p -m 0655 misc/%{name}.env 
%{buildroot}%{_sysconfdir}/sysconfig/%{name} ++install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng ++install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate + rm INSTALL %{buildroot}/usr/include/*.h + + %files +-%doc AUTHORS ChangeLog COPYING README.md TODO +-%{_sbindir}/rasdaemon ++%doc AUTHORS ChangeLog COPYING TODO ++%{_sbindir}/%{name} + %{_sbindir}/ras-mc-ctl + %{_mandir}/*/* + %{_unitdir}/*.service + %{_sysconfdir}/ras/dimm_labels.d + %{_sysconfdir}/ras/*/* + %config(noreplace) %{_sysconfdir}/sysconfig/%{name} ++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng ++%config(noreplace) /usr/share/%{name}/%{name}.logrotate ++ ++%post ++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog service is enabled and running, create config file and restart it"; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ systemctl restart syslog-ng.service; ++fi ++if [ -d "%{_sysconfdir}/logrotate.d" ]; then ++ rm -rf %{_sysconfdir}/logrotate.d/%{name}; ++ ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name}; ++fi ++if ! 
systemctl is-enabled --quiet %{name}.service; then ++ echo "Rasdaemon service is not enabled, enable it"; ++ systemctl enable %{name}.service; ++fi ++systemctl restart %{name}.service ++ ++%preun ++systemctl stop %{name}.service ++systemctl disable %{name}.service ++ ++%postun ++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog service is enabled and running, delete config file and restart it"; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ systemctl restart syslog-ng.service; ++fi ++if [ -d "%{_sysconfdir}/logrotate.d" ]; then ++ rm -rf %{_sysconfdir}/logrotate.d/%{name}; ++fi + + %changelog + +diff --git a/misc/rasdaemon.syslog-ng.in b/misc/rasdaemon.syslog-ng.in +new file mode 100644 +index 0000000..b3308f8 +--- /dev/null ++++ b/misc/rasdaemon.syslog-ng.in +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++destination d_rasdaemon { file("@localstatedir@/log/rasdaemon" persist-name(rasdaemon-syslog)); }; ++ ++filter f_rasdaemon { program("rasdaemon"); }; ++ ++log { source(s_sys); filter(f_rasdaemon); destination(d_rasdaemon); }; +-- +2.43.5 + diff --git a/1002-config-add-rsyslog-config.patch b/1002-config-add-rsyslog-config.patch new file mode 100644 index 0000000000000000000000000000000000000000..8ad5777c21a1c2b7436c8e7ff6357027c9d7fddf --- /dev/null +++ b/1002-config-add-rsyslog-config.patch @@ -0,0 +1,160 @@ +From f1f27e8f90a0be341e367a40962f6f7103504659 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Apr 2025 11:18:02 +0800 +Subject: [PATCH 02/30] config: add rsyslog config + +redirect all rasdaemon log to /var/log/rasdaemon, +add related modification in rasdaemon.spec + +The patch does not directly add a dependency on the rsyslog package +to the rasdaemon RPM. Instead, it dynamically checks whether the +rsyslog service is running during installation and configures accordingly. 
+ +Signed-off-by: Bing Wu +Signed-off-by: Ruidong Tian +--- + Makefile.am | 14 ++++++++++++-- + misc/rasdaemon.logrotate.in | 3 ++- + misc/rasdaemon.rsyslog.in | 3 +++ + misc/rasdaemon.spec.in | 19 ++++++++++++++++--- + 5 files changed, 34 insertions(+), 6 deletions(-) + create mode 100644 misc/rasdaemon.rsyslog.in + +diff --git a/Makefile.am b/Makefile.am +index a1f6edf..e3e66bb 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -9,9 +9,12 @@ SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in + SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng) + LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in + LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) ++RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in ++RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog) + EXTRA_DIST = \ + $(SYSTEMD_SERVICES_IN) \ + $(SYSLOG_SERVICES_IN) \ ++ $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/mc_event_trigger \ +@@ -21,6 +24,7 @@ CLEANFILES= \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ ++ misc/rasdaemon.rsyslog \ + misc/rasdaemon.logrotate + + DISTCLEANFILES = misc/rasdaemon.spec +@@ -28,7 +32,7 @@ DISTCLEANFILES = misc/rasdaemon.spec + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + +@@ -38,9 +42,12 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n + .syslog-ng.in.syslog-ng: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ + ++.rsyslog.in.rsyslog: ++ 
sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -149,6 +156,9 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ + fi ++ if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.conf"; \ ++ fi + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi +diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in +index b7b62fe..ca188ba 100644 +--- a/misc/rasdaemon.logrotate.in ++++ b/misc/rasdaemon.logrotate.in +@@ -9,6 +9,7 @@ + copytruncate + sharedscripts + postrotate +- @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true ++ (/usr/bin/systemctl is-active --quiet syslog-ng.service && /usr/bin/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1) || true ++ (/usr/bin/systemctl is-active --quiet rsyslog.service && /usr/bin/systemctl kill -s HUP rsyslog.service >/dev/null 2>&1) || true + endscript + } +diff --git a/misc/rasdaemon.rsyslog.in b/misc/rasdaemon.rsyslog.in +new file mode 100644 +index 0000000..d1a5cf1 +--- /dev/null ++++ b/misc/rasdaemon.rsyslog.in +@@ -0,0 +1,3 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++:programname, isequal, "rasdaemon" @localstatedir@/log/rasdaemon +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in 
b/misc/rasdaemon.spec.in +index 8ab3d50..4cc859f 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -54,6 +54,7 @@ install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl + install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} + install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng + install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate ++install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -67,14 +68,21 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) %{_sysconfdir}/sysconfig/%{name} + %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng + %config(noreplace) /usr/share/%{name}/%{name}.logrotate ++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog + + %post +-if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + systemctl restart syslog-ng.service; + fi ++if systemctl is-active --quiet rsyslog.service; then ++ echo "Rsyslog service is enabled and running, create config file and restart it"; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ systemctl restart rsyslog.service; ++fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; + ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name}; +@@ -90,11 +98,16 @@ systemctl stop %{name}.service + systemctl disable %{name}.service + + %postun +-if systemctl is-enabled --quiet 
syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then +- echo "Syslog service is enabled and running, delete config file and restart it"; ++if systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog-ng service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + systemctl restart syslog-ng.service; + fi ++if systemctl is-active --quiet rsyslog.service; then ++ echo "Rsyslog service is enabled and running, delete config file and restart it"; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ systemctl restart rsyslog.service; ++fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; + fi +-- +2.43.5 + diff --git a/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..1474b7d2c9bd9c96be286d64cb20fac267baa4bf --- /dev/null +++ b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch @@ -0,0 +1,734 @@ +From e14173ad86ac94b9e4af84eaddb1abe3bc6410b7 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 15:25:09 +0800 +Subject: [PATCH] rasdaemon: trace SIGBUS event for hardware error + +Kernel will send SIGBUS to program when read DE/UE, use rasdaemon to +catch this SIGBUS and print it like follow: + <...>-71085 [056] d... 
0.007781 signal_generate \ + 2025-03-18 15:24:11 +0800 signal: Bus error, errorno: 0, code: 4, \ + comm: einj_mem_uc, pid: 71085, grp: 0, res: Deliverd, \ + msg: Hardware memory error consumed: action required + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 5 +- + configure.ac | 11 ++++ + ras-events.c | 27 +++++++- + ras-events.h | 1 + + ras-record.c | 75 +++++++++++++++++++++++ + ras-record.h | 20 ++++++ + ras-report.c | 82 +++++++++++++++++++++++++ + ras-report.h | 6 +- + ras-signal-handler.c | 143 +++++++++++++++++++++++++++++++++++++++++++ + ras-signal-handler.h | 30 +++++++++ + util/ras-mc-ctl.in | 42 ++++++++++++- + 11 files changed, 438 insertions(+), 4 deletions(-) + create mode 100644 ras-signal-handler.c + create mode 100644 ras-signal-handler.h + +diff --git a/Makefile.am b/Makefile.am +index e3e66bb..1306d97 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -112,6 +112,9 @@ endif + if WITH_JAGUAR_NS_DECODE + rasdaemon_SOURCES += non-standard-jaguarmicro.c + endif ++if WITH_SIGNAL ++ rasdaemon_SOURCES += ras-signal-handler.c ++endif + + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) +@@ -122,7 +125,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ +- non-standard-jaguarmicro.h trigger.h unified-sel.h ++ non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 1cb00b6..25e0cb2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -244,6 +244,16 @@ AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes + 
AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) + ++AC_ARG_ENABLE([signal], ++ AS_HELP_STRING([--enable-signal], [enable signal event(currently experimental)])) ++ ++AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_SIGNAL,1,"have signal event") ++ AC_SUBST([WITH_SIGNAL]) ++]) ++AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -290,4 +300,5 @@ compile time options summary + CPU fault isolation : $USE_CPU_FAULT_ISOLATION + YITIAN RAS errors : $USE_YITIAN_NS_DECODE + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE ++ Signal : $USE_SIGNAL + EOF +diff --git a/ras-events.c b/ras-events.c +index 6692a31..2220e9a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -34,6 +34,7 @@ + #include "ras-memory-failure-handler.h" + #include "ras-non-standard-handler.h" + #include "ras-page-isolation.h" ++#include "ras-signal-handler.h" + #include "ras-record.h" + #include "trigger.h" + +@@ -315,6 +316,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable); + #endif + ++#ifdef HAVE_SIGNAL ++ rc |= __toggle_ras_mc_event(ras, "signal", "signal_generate", enable); ++#endif ++ + free_ras: + free(ras); + if (rc) +@@ -335,7 +340,7 @@ static void setup_event_trigger(char *event) + } + + #ifdef HAVE_DISKERROR +-#ifndef HAVE_BLK_RQ_ERROR ++#if (!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL) + /* + * Set kernel filter. libtrace doesn't provide an API for setting filters + * in kernel, we have to implement it here. 
+@@ -943,6 +948,10 @@ int handle_ras_events(int record_events, int enable_ipmitool) + #ifdef HAVE_DEVLINK + char *filter_str = NULL; + #endif ++#ifdef HAVE_SIGNAL ++ char signal_filter[64]; ++#endif ++ + + ras = calloc(1, sizeof(*ras)); + if (!ras) { +@@ -1173,6 +1182,22 @@ int handle_ras_events(int record_events, int enable_ipmitool) + "cxl", "memory_module"); + #endif + ++#ifdef HAVE_SIGNAL ++ snprintf(signal_filter, sizeof(signal_filter), "sig == %d && code >= %d", SIGBUS, BUS_OBJERR); ++ // ensure filter enabled ++ usleep(30000); ++ rc = filter_ras_mc_event(ras, "signal", "signal_generate", signal_filter); ++ if (!rc) { ++ rc = add_event_handler(ras, pevent, page_size, "signal", "signal_generate", ++ ras_signal_event_handler, NULL, SIGNAL_EVENT); ++ if (!rc) ++ num_events++; ++ else if (rc != -EINVAL) ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "signal", "signal_generate"); ++ } ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace any supported RAS events. 
Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 83d41df..1689a12 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -35,6 +35,7 @@ enum { + CXL_GENERAL_MEDIA_EVENT, + CXL_DRAM_EVENT, + CXL_MEMORY_MODULE_EVENT, ++ SIGNAL_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index eed7aca..31a93a4 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -1142,6 +1142,61 @@ int ras_store_cxl_memory_module_event(struct ras_events *ras, + } + #endif + ++#ifdef HAVE_SIGNAL ++static const struct db_fields signal_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "sig", .type = "INTEGER" }, ++ { .name = "errorno", .type = "INTEGER" }, ++ { .name = "code", .type = "INTEGER" }, ++ { .name = "comm", .type = "TEXT" }, ++ { .name = "pid", .type = "INTEGER" }, ++ { .name = "grp", .type = "INTEGER" }, ++ { .name = "res", .type = "INTEGER" }, ++ ++}; ++ ++static const struct db_table_descriptor signal_event_tab = { ++ .name = "signal_event", ++ .fields = signal_event_fields, ++ .num_fields = ARRAY_SIZE(signal_event_fields), ++}; ++ ++int ras_store_signal_event(struct ras_events *ras, struct ras_signal_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_signal_event) ++ return -1; ++ log(TERM, LOG_INFO, "signal_event store: %p\n", priv->stmt_signal_event); ++ ++ sqlite3_bind_text(priv->stmt_signal_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int(priv->stmt_signal_event, 2, ev->sig); ++ sqlite3_bind_int(priv->stmt_signal_event, 3, ev->error_no); ++ sqlite3_bind_int(priv->stmt_signal_event, 4, ev->code); ++ sqlite3_bind_text(priv->stmt_signal_event, 5, ev->comm, -1, NULL); ++ sqlite3_bind_int(priv->stmt_signal_event, 6, ev->pid); ++ sqlite3_bind_int(priv->stmt_signal_event, 7, ev->group); ++ sqlite3_bind_int(priv->stmt_signal_event, 8, ev->result); ++ ++ rc = sqlite3_step(priv->stmt_signal_event); ++ if (rc != SQLITE_OK && rc 
!= SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do signal_event step on sqlite: error = %d\n", rc); ++ ++ rc = sqlite3_reset(priv->stmt_signal_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset signal_event on sqlite: error = %d\n", ++ rc); ++ ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + /* + * Generic code + */ +@@ -1550,6 +1605,16 @@ int ras_mc_event_opendb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_SIGNAL ++ rc = ras_mc_create_table(priv, &signal_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_signal_event, ++ &signal_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } ++#endif ++ + ras->db_priv = priv; + return 0; + +@@ -1734,6 +1799,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_SIGNAL ++ if (priv->stmt_signal_event) { ++ rc = sqlite3_finalize(priv->stmt_signal_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize signal_event sqlite: error = %d\n", ++ cpu, rc); ++ } ++#endif ++ + rc = sqlite3_close_v2(db); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, +diff --git a/ras-record.h b/ras-record.h +index eec0702..2dd6630 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -9,6 +9,7 @@ + #define __RAS_RECORD_H + + #include ++#include + #include + #include + +@@ -258,6 +259,17 @@ struct ras_cxl_memory_module_event { + uint8_t res_id[CXL_PLDM_RES_ID_LEN]; + }; + ++struct ras_signal_event { ++ char timestamp[64]; ++ int sig; ++ int error_no; ++ int code; ++ char *comm; ++ pid_t pid; ++ int group; ++ int result; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -275,6 +287,7 @@ struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; + struct ras_cxl_dram_event; + struct ras_cxl_memory_module_event; ++struct ras_signal_event; + + #ifdef HAVE_SQLITE3 + +@@ -315,6 +328,9 @@ struct 
sqlite3_priv { + sqlite3_stmt *stmt_cxl_dram_event; + sqlite3_stmt *stmt_cxl_memory_module_event; + #endif ++#ifdef HAVE_SIGNAL ++ sqlite3_stmt *stmt_signal_event; ++#endif + }; + + struct db_fields { +@@ -361,6 +377,8 @@ int ras_store_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev); + int ras_store_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev); ++int ras_store_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned int cpu, +@@ -401,6 +419,8 @@ static inline int ras_store_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev) { return 0; }; + static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev) { return 0; }; ++static inline int ras_store_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 4535421..35d2792 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -13,6 +13,7 @@ + #include + + #include "ras-report.h" ++#include "ras-record.h" + + static int setup_report_socket(void) + { +@@ -735,6 +736,37 @@ static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memor + return 0; + } + ++static int set_signal_event_backtrace(char *buf, struct ras_signal_event *ev) ++{ ++ unsigned int size = MAX_BACKTRACE_SIZE; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ while (*buf && size > 0) { ++ buf++; ++ size--; ++ } ++ ++ snprintf(buf, size, "BACKTRACE=" ++ "timestamp=%s\n" ++ "signal=%d\n" ++ "errorno=%d\n" ++ "code=%d\n" ++ "comm=%s\n" ++ "grp=%d\n" ++ "res=%d\n", ++ ev->timestamp, ++ ev->sig, ++ ev->error_no, ++ ev->code, ++ ev->comm, ++ ev->group, ++ ev->result); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev) + { + char buf[MAX_BACKTRACE_SIZE]; +@@ -812,6 +844,10 @@ static int commit_report_backtrace(int 
sockfd, int type, void *ev) + rc = set_cxl_memory_module_event_backtrace(buf, + (struct ras_cxl_memory_module_event *)ev); + break; ++ case SIGNAL_EVENT: ++ rc = set_signal_event_backtrace(buf, ++ (struct ras_signal_event *)ev); ++ break; + default: + return -1; + } +@@ -1552,3 +1588,49 @@ cxl_memory_module_fail: + + return -1; + } ++ ++int ras_report_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto signal_fail; ++ ++ rc = commit_report_backtrace(sockfd, SIGNAL_EVENT, ev); ++ if (rc < 0) ++ goto signal_fail; ++ ++ snprintf(buf, MAX_MESSAGE_SIZE, "ANALYZER=%s", ++ "rasdaemon-signal_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto signal_fail; ++ ++ snprintf(buf, MAX_MESSAGE_SIZE, "REASON=%s", "SIGBUS for Hardware error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto signal_fail; ++ ++ done = 1; ++ ++signal_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index ceb64ce..f680a25 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -57,6 +57,8 @@ int ras_report_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev); + int ras_report_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev); ++int ras_report_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev); + + #else + +@@ -108,7 +110,9 @@ static inline int ras_report_cxl_dram_event(struct ras_events *ras, + static inline int ras_report_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev) + { return 0; }; +- ++static inline int ras_report_signal_event(struct ras_events *ras, ++ 
struct ras_signal_event *ev) ++{ return 0; }; + #endif + + #endif +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +new file mode 100644 +index 0000000..fb0bfd3 +--- /dev/null ++++ b/ras-signal-handler.c +@@ -0,0 +1,143 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2025 Ruidong Tian ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++ ++#include "ras-signal-handler.h" ++#include "ras-report.h" ++#include "types.h" ++ ++enum { ++ TRACE_SIGNAL_DELIVERED, ++ TRACE_SIGNAL_IGNORED, ++ TRACE_SIGNAL_ALREADY_PENDING, ++ TRACE_SIGNAL_OVERFLOW_FAIL, ++ TRACE_SIGNAL_LOSE_INFO, ++}; ++ ++static char *signal_msg[] = { ++ [BUS_ADRALN] = "invalid address alignment", ++ [BUS_ADRERR] = "non-existent address", ++ [BUS_OBJERR] = "object-specific hardware error", ++ [BUS_MCEERR_AR] = "Hardware memory error consumed: action required", ++ [BUS_MCEERR_AO] = "Hardware memory error detected in process but not consumed: action optional", ++}; ++ ++static char *errcode_str[] = { ++ [BUS_ADRALN] = "BUS_ADRALN", ++ [BUS_ADRERR] = "BUS_ADRERR", ++ [BUS_OBJERR] = "BUS_OBJERR", ++ [BUS_MCEERR_AR] = "BUS_MCEERR_AR", ++ [BUS_MCEERR_AO] = "BUS_MCEERR_AO", ++}; ++ ++static char *signal_res[] = { ++ 
[TRACE_SIGNAL_DELIVERED] = "Delivered", ++ [TRACE_SIGNAL_IGNORED] = "Ignore", ++ [TRACE_SIGNAL_ALREADY_PENDING] = "Already pending", ++ [TRACE_SIGNAL_OVERFLOW_FAIL] = "Overflow fail", ++ [TRACE_SIGNAL_LOSE_INFO] = "Lose info", ++}; ++ ++static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev) ++{ ++ trace_seq_printf(s, ++ "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", ++ ev->timestamp, strsignal(ev->sig), ev->error_no, ++ (ev->code < BUS_ADRALN || ev->code > BUS_MCEERR_AO) ? "Unknown" : errcode_str[ev->code], /* only BUS_ADRALN..BUS_MCEERR_AO have table entries */ ++ ev->comm, ev->pid, ++ ev->group, ++ (ev->result < 0 || ev->result > TRACE_SIGNAL_LOSE_INFO) ? "Unknown" : signal_res[ev->result], ++ (ev->sig == SIGBUS && ev->code >= BUS_ADRALN && ev->code <= BUS_MCEERR_AO) ? signal_msg[ev->code] : "Unknown"); /* bound-check code before indexing signal_msg */ ++} ++ ++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_signal_event ev; ++ ++ /* ++ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. ++ * On previous kernels, the way to properly generate an event would ++ * be to inject a fake one, measure its timestamp and diff it against ++ * gettimeofday. We won't do it here. Instead, let's use uptime, ++ * falling-back to the event report's time, if "uptime" clock is ++ * not available (legacy kernels).
++ */ ++ ++ if (ras->use_uptime) ++ now = record->ts / user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ if (tep_get_field_val(s, event, "sig", record, &val, 1) < 0) ++ return -1; ++ ev.sig = val; ++ ++ if (tep_get_field_val(s, event, "errno", record, &val, 1) < 0) ++ return -1; ++ ev.error_no = val; ++ ++ if (tep_get_field_val(s, event, "code", record, &val, 1) < 0) ++ return -1; ++ ev.code = val; ++ ++ ev.comm = tep_get_field_raw(s, event, "comm", record, &len, 1); ++ if (!ev.comm) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "pid", record, &val, 1) < 0) ++ return -1; ++ ev.pid = val; ++ ++ if (tep_get_field_val(s, event, "group", record, &val, 1) < 0) ++ return -1; ++ ev.group = val; ++ ++ if (tep_get_field_val(s, event, "result", record, &val, 1) < 0) ++ return -1; ++ ev.result = val; ++ ++ report_ras_signal_event(s, &ev); ++ ++ /* Store data into the SQLite DB */ ++#ifdef HAVE_SQLITE3 ++ ras_store_signal_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_signal_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-signal-handler.h b/ras-signal-handler.h +new file mode 100644 +index 0000000..9740c61 +--- /dev/null ++++ b/ras-signal-handler.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2025 Ruidong Tian ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef __RAS_SIGNAL_HANDLER_H ++#define __RAS_SIGNAL_HANDLER_H ++ ++#include ++ ++#include "ras-events.h" ++ ++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, ++ struct tep_event *event, void *context); ++ ++#endif +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index ba48660..648517f 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -35,6 +35,7 @@ my $has_disk_errors = 0; + my $has_extlog = 0; + my $has_mem_failure = 0; + my $has_mce = 0; ++my $has_signal = 0; + + @WITH_AER_TRUE@$has_aer = 1; + @WITH_ARM_TRUE@$has_arm = 1; +@@ -44,6 +45,7 @@ my $has_mce = 0; + @WITH_EXTLOG_TRUE@$has_extlog = 1; + @WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1; + @WITH_MCE_TRUE@$has_mce = 1; ++@WITH_SIGNAL_TRUE@$has_signal = 1; + + my %conf = (); + my %bus = (); +@@ -1546,7 +1548,7 @@ sub summary + { + require DBI; + my ($query, $query_handle, $out); +- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); ++ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result, $sigcode); + my ($etype, $severity, $etype_string, $severity_string); + my ($dev_name, $dev); + my ($mpidr, $memdev); +@@ -1828,6 +1830,24 @@ sub summary + $query_handle->finish; + } + ++ # Signal event ++ if ($has_signal == 1) { ++ $query = "select code, count(*) from signal_event$conf{opt}{since} group by code"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($sigcode, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$sigcode errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "SIGNAL events summary:\n$out\n"; ++ } else { ++ print "No SIGNAL.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +@@ -1849,6 
+1869,7 @@ sub errors + my ($nibble_mask, $bank_group, $row, $column, $cor_mask); + my ($event_type, $event_sub_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status); + my ($sub_type, $sub_channel, $cme_threshold_ev_flags, $cme_count, $cvme_count); ++ my ($signal, $errorno, $code, $comm, $pid, $grp, $res); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -2366,6 +2387,25 @@ sub errors + $query_handle->finish; + } + ++ # SIGNAL event ++ if ($has_signal == 1) { ++ $query = "select id, timestamp, signal, errorno, code, comm, pid, grp, res from signal_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $signal, $errorno, $code, $comm, $pid, $grp, $res)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "signal=$signal, errorno=$errorno, code=$code, comm=$comm, pid=$pid, grp=$grp, res=$res\n"; ++ } ++ if ($out ne "") { ++ print "SIGNAL events:\n$out\n"; ++ } else { ++ print "No SIGNAL event.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +-- +2.43.5 + diff --git a/1004-rasdaemon-align-event-name-in-log.patch b/1004-rasdaemon-align-event-name-in-log.patch new file mode 100644 index 0000000000000000000000000000000000000000..37f94a48c5ffd4e78cbd48032b64713ef071ec5c --- /dev/null +++ b/1004-rasdaemon-align-event-name-in-log.patch @@ -0,0 +1,34 @@ +From 86a6cbb904a50269c901ba2ed591fde7debfa298 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 15:52:41 +0800 +Subject: [PATCH 04/30] rasdaemon: align event name in log + +Now rasdaemon event name is not align in log: + + <...>-52503 [070] dNh. 0.007127 arm_event ... + <...>-52503 [052] .... 0.007127 memory_failure_event ... +Align it and result look like: + <...>-113714 [059] dNh. 0.007942 arm_event: ... + <...>-113714 [069] .... 
0.007942 memory_failure_event: ... + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ras-events.c b/ras-events.c +index 2220e9a..88c8a5f 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -418,7 +418,7 @@ static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf, + /* TODO - logging */ + trace_seq_init(&s); + tep_print_event(pdata->ras->pevent, &s, &record, +- "%16s-%-5d [%03d] %s %6.1000d %s %s", ++ "%16s-%-10d [%03d] %s %6.1000d %25s: %s", + TEP_PRINT_COMM, TEP_PRINT_PID, TEP_PRINT_CPU, + TEP_PRINT_LATENCY, TEP_PRINT_TIME, TEP_PRINT_NAME, + TEP_PRINT_INFO); +-- +2.43.5 + diff --git a/1005-rasdaemon-skip-doesn-t-exist-event.patch b/1005-rasdaemon-skip-doesn-t-exist-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..1f6cbd2a611556a1f826bffbff65a0896af068f9 --- /dev/null +++ b/1005-rasdaemon-skip-doesn-t-exist-event.patch @@ -0,0 +1,56 @@ +From 7a13978040e6aa3e841cbbd5e6f91e5f98ae8d82 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 25 Mar 2025 10:16:13 +0800 +Subject: [PATCH 05/30] rasdaemon: skip doesn't exist event + +When compiling rasdaemon with the --enable-all configuration flag, +the system may detect unsupported hardware events - for instance, +ARM-specific events on x86 architectures. This causes the program +to enter a busy-wait loop in the wait_access function. A better +approach would be to explicitly skip these architecture-mismatched +events during initialization. 
+ +Signed-off-by: Ruidong Tian +--- + ras-events.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/ras-events.c b/ras-events.c +index 88c8a5f..d42ed9f 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -826,6 +826,18 @@ static int select_tracing_timestamp(struct ras_events *ras) + return 0; + } + ++static bool check_event_exist(struct ras_events *ras, char *group, char *event) ++{ ++ char fname[MAX_PATH + 256]; ++ ++ snprintf(fname, sizeof(fname), "%s/tracing/events/%s/%s", ++ ras->debugfs, group, event); ++ if (access(fname, F_OK) == 0) ++ return true; ++ ++ return false; ++} ++ + #define EVENT_DISABLED 1 + + static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, +@@ -837,6 +849,12 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, + char *page, fname[MAX_PATH + 1]; + struct tep_event_filter *filter = NULL; + ++ if (!check_event_exist(ras, group, event)) { ++ log(ALL, LOG_WARNING, "%s:%s event not exist\n", ++ group, event); ++ return -EINVAL; ++ } ++ + snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); + + fd = open_trace(ras, fname, O_RDONLY); +-- +2.43.5 + diff --git a/1006-rasdaemon-support-memory-corrected-error-statistics.patch b/1006-rasdaemon-support-memory-corrected-error-statistics.patch new file mode 100644 index 0000000000000000000000000000000000000000..845a3600e10148471f43eae1d8d2bce288cf1cb3 --- /dev/null +++ b/1006-rasdaemon-support-memory-corrected-error-statistics.patch @@ -0,0 +1,124 @@ +From 32bd3dc84cc235dc589ae6ac149a3567c7b501a6 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 25 Mar 2025 18:36:07 +0800 +Subject: [PATCH 06/30] rasdaemon: support memory corrected error statistics + +A high volume of Correctable Errors (CEs) indicates that the +memory controller is frequently performing Error-Correcting Code (ECC) +operations, which will increase memory controller latency. 
+The CE statistics feature can report the number of CEs occurring per +second. When the count exceeds a certain threshold, it signifies +intensive ECC activity and triggers warnings. + +New environment MC_CE_STAT_THRESHOLD to setup threshold. + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 5 +++++ + ras-mc-handler.c | 23 +++++++++++++++++++++++ + ras-mc-handler.h | 1 + + rasdaemon.c | 7 +++++++ + 4 files changed, 36 insertions(+) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 963aaa0..4375781 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -88,3 +88,8 @@ TRIGGER_DIR= + # MC_UE_TRIGGER=mc_event_trigger + MC_CE_TRIGGER= + MC_UE_TRIGGER= ++ ++# CE Statistic Threshold ++# ++# Specify the threshold of CE per second. ++MC_CE_STAT_THRESHOLD=2000 +\ No newline at end of file +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index fdd85a9..7a18f73 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -103,6 +103,27 @@ free: + free(env[i]); + } + ++static unsigned long long per_sec_ce_count; ++unsigned long long mc_ce_stat_threshold; ++static time_t cur; ++static int ras_mc_event_stat(time_t now, struct ras_mc_event *e) ++{ ++ if (strcmp(e->error_type, "Corrected")) ++ return 0; ++ ++ if (cur == now) { ++ per_sec_ce_count += e->error_count; ++ } else { ++ cur = now; ++ per_sec_ce_count = e->error_count; ++ } ++ ++ if (mc_ce_stat_threshold && per_sec_ce_count > mc_ce_stat_threshold) /* threshold 0 (env unset): do not report */ ++ log(ALL, LOG_ERR, " mc_event_stat: memory corrected error report %llu/sec\n", per_sec_ce_count); ++ ++ return 0; ++} ++ + int ras_mc_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -263,6 +284,8 @@ int ras_mc_event_handler(struct trace_seq *s, + + ras_store_mc_event(ras, &ev); + ++ ras_mc_event_stat(now, &ev); ++ + #ifdef HAVE_MEMORY_CE_PFA + /* Account page corrected errors */ + if (!strcmp(ev.error_type, "Corrected")) +diff --git a/ras-mc-handler.h b/ras-mc-handler.h +index 2aa3c28..cf12959 100644 +---
a/ras-mc-handler.h ++++ b/ras-mc-handler.h +@@ -10,6 +10,7 @@ + #include + + #include "ras-events.h" ++extern unsigned long long mc_ce_stat_threshold; + + void mc_event_trigger_setup(void); + +diff --git a/rasdaemon.c b/rasdaemon.c +index 840be61..d97665f 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -13,6 +13,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-record.h" ++#include "ras-mc-handler.h" + #include "types.h" + + /* +@@ -23,6 +24,7 @@ + #define TOOL_DESCRIPTION "RAS daemon to log the RAS events." + #define ARGS_DOC "" + #define DISABLE "DISABLE" ++#define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD" + + const char *argp_program_version = TOOL_NAME " " VERSION; + const char *argp_program_bug_address = "Mauro Carvalho Chehab "; +@@ -126,6 +128,11 @@ int main(int argc, char *argv[]) + + choices_disable = getenv(DISABLE); + ++ if (getenv(MC_CE_STAT_THRESHOLD)) ++ mc_ce_stat_threshold = strtoull(getenv(MC_CE_STAT_THRESHOLD), NULL, 0); ++ if (mc_ce_stat_threshold) ++ log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold); ++ + #ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, +-- +2.43.5 + diff --git a/1007-rasdaemon-introduce-poison-page-statistics.patch b/1007-rasdaemon-introduce-poison-page-statistics.patch new file mode 100644 index 0000000000000000000000000000000000000000..12dd35e30ffa21be01e8f81b6af46496eee12fd5 --- /dev/null +++ b/1007-rasdaemon-introduce-poison-page-statistics.patch @@ -0,0 +1,249 @@ +From 9e9a9b7cd802f7874f674fb024ef0dd93e223060 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 26 Mar 2025 14:03:33 +0800 +Subject: [PATCH 07/30] rasdaemon: introduce poison page statistics + +An excessive number of poison pages can lead to memory fragmentation, +which may degrade system performance. This patch introduces a threshold +monitoring mechanism for poison pages. 
When the number of poison pages +exceeds the predefined threshold, a warning is issued to alert +administrators. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 7 +++++- + configure.ac | 6 ++++++ + misc/rasdaemon.env | 8 ++++++- + ras-memory-failure-handler.c | 5 +++++ + ras-memory-failure-handler.h | 2 ++ + ras-page-isolation.c | 6 ++++++ + ras-poison-page-stat.c | 41 ++++++++++++++++++++++++++++++++++++ + ras-poison-page-stat.h | 14 ++++++++++++ + rasdaemon.c | 9 ++++++++ + 9 files changed, 96 insertions(+), 2 deletions(-) + create mode 100644 ras-poison-page-stat.c + create mode 100644 ras-poison-page-stat.h + +diff --git a/Makefile.am b/Makefile.am +index 1306d97..56e992d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -116,6 +116,10 @@ if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c + endif + ++if WITH_POISON_PAGE_STAT ++ rasdaemon_SOURCES += ras-poison-page-stat.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) + +@@ -125,7 +129,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ +- non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h ++ non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ ++ ras-poison-page-stat.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 25e0cb2..5fe1862 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -254,6 +254,12 @@ AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes]) + 
AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"]) + ++AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_POISON_PAGE_STAT,1,"have poison page statistics") ++ AC_SUBST([WITH_POISON_PAGE_STAT]) ++]) ++AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 4375781..3aa3a0d 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -92,4 +92,10 @@ MC_UE_TRIGGER= + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +-MC_CE_STAT_THRESHOLD=2000 +\ No newline at end of file ++MC_CE_STAT_THRESHOLD=2000 ++ ++# Poison page statistics ++# ++# Supported units: ++# POISON_STAT_THRESHOLD: kB ++POISON_STAT_THRESHOLD=102400 +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 4d20ce8..d4c293b 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -12,6 +12,7 @@ + + #include "ras-logger.h" + #include "ras-memory-failure-handler.h" ++#include "ras-poison-page-stat.h" + #include "ras-report.h" + #include "trigger.h" + #include "types.h" +@@ -208,6 +209,10 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + ev.action_result = get_action_result(val); + trace_seq_printf(s, "action_result=%s ", ev.action_result); + ++#ifdef HAVE_POISON_PAGE_STAT ++ ras_poison_page_stat(); ++#endif ++ + /* Store data into the SQLite DB */ + #ifdef HAVE_SQLITE3 + ras_store_mf_event(ras, &ev); +diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h +index f0cea71..85e2dd2 100644 +--- a/ras-memory-failure-handler.h 
++++ b/ras-memory-failure-handler.h +@@ -11,6 +11,8 @@ + + #include "ras-events.h" + ++extern unsigned long long poison_stat_threshold; ++ + void mem_fail_event_trigger_setup(void); + int ras_memory_failure_event_handler(struct trace_seq *s, + struct tep_record *record, +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 2166f5c..246cd12 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -15,6 +15,8 @@ + + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "ras-poison-page-stat.h" ++#include "ras-record.h" + + #define PARSED_ENV_LEN 50 + #define ROW_ID_MAX_LEN 200 +@@ -349,6 +351,10 @@ static void page_offline(struct page_record *pr) + + log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", + addr, page_state[pr->offlined]); ++ ++#ifdef HAVE_POISON_PAGE_STAT ++ ras_poison_page_stat(); ++#endif + } + + static void page_record(struct page_record *pr, unsigned int count, time_t time) +diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c +new file mode 100644 +index 0000000..2ce1d2a +--- /dev/null ++++ b/ras-poison-page-stat.c +@@ -0,0 +1,41 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-poison-page-stat.h" ++#include "types.h" ++ ++unsigned long long poison_stat_threshold; ++int ras_poison_page_stat(void) ++{ ++ FILE *fp; ++ char line[MAX_PATH]; ++ unsigned long long corrupted_kb = 0; ++ ++ fp = fopen("/proc/meminfo", "r"); ++ if (!fp) { ++ log(ALL, LOG_ERR, "Failed to open /proc/meminfo\n"); ++ return EXIT_FAILURE; ++ } ++ ++ while (fgets(line, sizeof(line), fp)) ++ if (strstr(line, "HardwareCorrupted")) ++ if (sscanf(line, "%*s %llukB", &corrupted_kb) == 1) ++ break; ++ ++ fclose(fp); ++ ++ if (poison_stat_threshold && corrupted_kb > poison_stat_threshold) /* threshold 0 (env unset): do not warn */ ++ log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %llu kB (threshold: %llu kB)\n", ++ corrupted_kb,
poison_stat_threshold); ++ ++ return 0; ++} +diff --git a/ras-poison-page-stat.h b/ras-poison-page-stat.h +new file mode 100644 +index 0000000..4fe25d2 +--- /dev/null ++++ b/ras-poison-page-stat.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_POISON_PAGE_STAT_H ++#define __RAS_POISON_PAGE_STAT_H ++ ++extern unsigned long long poison_stat_threshold; ++ ++int ras_poison_page_stat(void); ++ ++#endif +diff --git a/rasdaemon.c b/rasdaemon.c +index d97665f..6505dee 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -12,6 +12,7 @@ + + #include "ras-events.h" + #include "ras-logger.h" ++#include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "ras-mc-handler.h" + #include "types.h" +@@ -25,6 +26,7 @@ + #define ARGS_DOC "" + #define DISABLE "DISABLE" + #define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD" ++#define POISON_STAT_THRESHOLD "POISON_STAT_THRESHOLD" + + const char *argp_program_version = TOOL_NAME " " VERSION; + const char *argp_program_bug_address = "Mauro Carvalho Chehab "; +@@ -133,6 +135,13 @@ int main(int argc, char *argv[]) + if (mc_ce_stat_threshold) + log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold); + ++#ifdef HAVE_POISON_PAGE_STAT ++ if (getenv(POISON_STAT_THRESHOLD)) ++ poison_stat_threshold = strtoull(getenv(POISON_STAT_THRESHOLD), NULL, 0); ++ if (poison_stat_threshold) ++ log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold); ++#endif ++ + #ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, +-- +2.43.5 + diff --git a/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch new file mode 100644 index 0000000000000000000000000000000000000000..ee72a7446bf0c4fe82dea7b494a4d696e6295268 --- /dev/null +++ 
b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch @@ -0,0 +1,468 @@ +From d64ff047a5ab231ee6c1a797dc3ce612fb7a5a6c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 12 Dec 2024 09:37:06 +0800 +Subject: [PATCH 08/30] rasdaemon: erst: decode panic mce through erst + +ERST records the MCE information that caused the kernel panic, +helping us determine the cause of the last crash. +Using rasdaemon to check and parse the ERST records at startup. +Decoded info like follow: + <...>-0 [-01] .... 0.000000 mce_erst_record: 2025-03-26 14:52:42 +0800 bank=1, status= bd80000000100134, Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error, mci=Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR, mca=Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error K, cpu_type= Sapphirerapids server, cpu= 159, socketid= 1, ip= ffffffff914a6476, cs= 10, misc= 86, addr= 8158f58400, mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE, mcgcap= f000c15, apicid= 9f, ppin= fc6b80e0ba9d616, microcode= 2b000571 + +Now environment ERST_DELETE is introduced, rasdaemon will delete +origin erst file if ERST_DELETE set. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 5 +- + configure.ac | 11 +++ + misc/rasdaemon.env | 2 + + ras-erst.c | 195 +++++++++++++++++++++++++++++++++++++++++++++ + ras-erst.h | 17 ++++ + ras-mce-handler.c | 35 ++++++-- + ras-mce-handler.h | 4 + + ras-record.h | 4 + + rasdaemon.c | 11 +++ + 9 files changed, 275 insertions(+), 9 deletions(-) + create mode 100644 ras-erst.c + create mode 100644 ras-erst.h + +diff --git a/Makefile.am b/Makefile.am +index 56e992d..e1bcda1 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -119,6 +119,9 @@ endif + if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c + endif ++if WITH_ERST ++ rasdaemon_SOURCES += ras-erst.c ++endif + + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) +@@ -130,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ++ ras-poison-page-stat.h ras-erst.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 5fe1862..47e6346 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -260,6 +260,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pf + ]) + AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ]) + ++AC_ARG_ENABLE([erst], ++ AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)])) ++ ++AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_ERST,1,"have ERST") ++ 
AC_SUBST([WITH_ERST]) ++]) ++AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -307,4 +317,5 @@ compile time options summary + YITIAN RAS errors : $USE_YITIAN_NS_DECODE + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE + Signal : $USE_SIGNAL ++ ERST : $USE_ERST + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 3aa3a0d..193ee19 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -99,3 +99,5 @@ MC_CE_STAT_THRESHOLD=2000 + # Supported units: + # POISON_STAT_THRESHOLD: kB + POISON_STAT_THRESHOLD=102400 ++ ++ERST_DELETE=1 +diff --git a/ras-erst.c b/ras-erst.c +new file mode 100644 +index 0000000..c024d60 +--- /dev/null ++++ b/ras-erst.c +@@ -0,0 +1,195 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ras-events.h" ++#include "ras-erst.h" ++#include "ras-logger.h" ++#include "ras-mce-handler.h" ++#include "ras-record.h" ++#include "types.h" ++ ++struct mce { ++ uint64_t status; /* Bank's MCi_STATUS MSR */ ++ uint64_t misc; /* Bank's MCi_MISC MSR */ ++ uint64_t addr; /* Bank's MCi_ADDR MSR */ ++ uint64_t mcgstatus; /* Machine Check Global Status MSR */ ++ uint64_t ip; /* Instruction Pointer when the error happened */ ++ uint64_t tsc; /* CPU time stamp counter */ ++ uint64_t time; /* Wall time_t when error was detected */ ++ uint8_t cpuvendor; /* Kernel's X86_VENDOR enum */ ++ uint8_t inject_flags; /* Software inject flags */ ++ uint8_t severity; /* Error severity */ ++ uint8_t pad; ++ uint32_t cpuid; /* CPUID 1 EAX */ ++ uint8_t cs; /* Code segment */ ++ uint8_t bank; /* Machine check bank reporting the error */ ++ uint8_t cpu; /* CPU number; obsoleted by extcpu */ ++ uint8_t finished; /* Entry is valid */ ++ 
uint32_t extcpu; /* Linux CPU number that detected the error */ ++ uint32_t socketid; /* CPU socket ID */ ++ uint32_t apicid; /* CPU initial APIC ID */ ++ uint64_t mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ ++ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ ++ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ ++ uint64_t ppin; /* Protected Processor Inventory Number */ ++ uint32_t microcode; /* Microcode revision */ ++}; ++ ++static int erst_delete; ++ ++#define ERST_PATH "/sys/fs/pstore/erst" ++#define MCE_ERST_PREFIX "mce-erst" ++#define ERST_EVENT_NAME "mce_erst_record" ++ ++#ifdef HAVE_MCE ++static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) ++{ ++ struct mce_priv *mce = ras->mce_priv; ++ struct trace_seq s; ++ int rc = 0; ++ ++ switch (mce->cputype) { ++ case CPU_GENERIC: ++ break; ++ case CPU_K8: ++ rc = parse_amd_k8_event(ras, e); ++ break; ++ case CPU_AMD_SMCA: ++ case CPU_DHYANA: ++ rc = parse_amd_smca_event(ras, e); ++ break; ++ default: /* All other CPU types are Intel */ ++ rc = parse_intel_event(ras, e); ++ } ++ ++ if (rc) ++ return; ++ ++ mce_snprintf(e->error_msg, "%s", e->mcastatus_msg); ++ ++ trace_seq_init(&s); ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME); ++ ++ report_mce_event(ras, NULL, &s, e); ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static void handle_erst_mce_file(char *path, struct mce_event *e) ++{ ++ FILE *file; ++ struct mce mce; ++ struct stat file_stat; ++ ++ file = fopen(path, "r"); ++ if (!file) { ++ log(ALL, LOG_ERR, "Failed to open file %s\n", path); ++ return; ++ } ++ ++ if (stat(path, &file_stat) < 0) { ++ log(ALL, LOG_ERR, "Failed to stat file %s\n", path); ++ goto out; ++ } ++ ++ if (fread((char *)&mce, 1, sizeof(mce), file) < sizeof(mce)) { ++ log(ALL, LOG_ERR, "Failed to read file %s\n", path); ++ goto out;
++ } ++ ++ e->mcgcap = mce.mcgcap; ++ e->mcgstatus = mce.mcgstatus; ++ ++ e->status = mce.status; ++ e->addr = mce.addr; ++ e->misc = mce.misc; ++ e->synd = mce.synd; ++ e->ipid = mce.ipid; ++ e->ip = mce.ip; ++ e->tsc = mce.tsc; ++ e->walltime = mce.time; ++ e->cpu = mce.extcpu; ++ e->cpuid = mce.cpuid; ++ e->apicid = mce.apicid; ++ e->socketid = mce.socketid; ++ e->cs = mce.cs; ++ e->bank = mce.bank; ++ e->cpuvendor = mce.cpuvendor; ++ e->ppin = mce.ppin; ++ e->microcode = mce.microcode; ++ ++ if (erst_delete) { ++ if (!unlink(path)) /* unlink() returns 0 on success */ ++ log(ALL, LOG_INFO, "Deleted file %s\n", path); ++ else ++ log(ALL, LOG_ERR, "Failed to delete file %s\n", path); ++ } ++ ++out: ++ fclose(file); ++} ++ ++static void handle_erst_mce(void) ++{ ++ int rc; ++ struct ras_events ras = { 0 }; ++ struct dirent *entry; ++ DIR *dir; ++ ++ rc = init_mce_priv(&ras); ++ if (rc) { ++ log(ALL, LOG_INFO, "Can't register mce handler\n"); ++ return; ++ } ++ ++ dir = opendir(ERST_PATH); ++ if (!dir) { ++ log(ALL, LOG_INFO, "Failed to open directory\n"); ++ return; ++ } ++ ++ while ((entry = readdir(dir)) != NULL) { ++ struct stat path_stat = { 0 }; /* keeps st_mode defined if stat() fails */ ++ char file_path[MAX_PATH]; ++ struct mce_event mce = { 0 }; ++ ++ mce.erst = 1; ++ if (strncmp(entry->d_name, MCE_ERST_PREFIX, strlen(MCE_ERST_PREFIX))) ++ continue; ++ ++ snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name); ++ stat(file_path, &path_stat); ++ ++ if (S_ISREG(path_stat.st_mode)) { ++ handle_erst_mce_file(file_path, &mce); ++ } else { ++ log(TERM, LOG_ERR, "Unexpected file type\n"); ++ continue; ++ } ++ ++ ras_erst_mce_handler(&ras, &mce); ++ } ++ ++ closedir(dir); ++} ++#endif ++/* ERST just support mce now */ ++void handle_erst(void) ++{ ++ if (getenv(ERST_DELETE)) ++ erst_delete = atoi(getenv(ERST_DELETE)); ++ ++ handle_erst_mce(); ++} +diff --git a/ras-erst.h b/ras-erst.h +new file mode 100644 +index 0000000..83d7535 +--- /dev/null ++++ b/ras-erst.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later
*/ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_ERST_H ++#define __RAS_ERST_H ++ ++#define ERST_DELETE "ERST_DELETE" ++ ++#ifdef HAVE_MCE ++void handle_erst_mce(void); ++#endif ++ ++void handle_erst(void); ++#endif +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 8713390..3d8d97d 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -228,7 +228,7 @@ ret: + return ret; + } + +-int register_mce_handler(struct ras_events *ras, unsigned int ncpus) ++int init_mce_priv(struct ras_events *ras) + { + int rc; + struct mce_priv *mce; +@@ -249,6 +249,11 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + ras->mce_priv = NULL; + return rc; + } ++ ++ return rc; ++} ++static void set_imc_log(struct mce_priv *mce, unsigned int ncpus) ++{ + switch (mce->cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: +@@ -259,6 +264,17 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + default: + break; + } ++} ++ ++int register_mce_handler(struct ras_events *ras, unsigned int ncpus) ++{ ++ int rc; ++ ++ rc = init_mce_priv(ras); ++ if (rc) ++ return rc; ++ ++ set_imc_log(ras->mce_priv, ncpus); + + return rc; + } +@@ -267,9 +283,8 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + * End of mcelog's code + */ + +-static void report_mce_event(struct ras_events *ras, +- struct tep_record *record, +- struct trace_seq *s, struct mce_event *e) ++void report_mce_event(struct ras_events *ras, struct tep_record *record, ++ struct trace_seq *s, struct mce_event *e) + { + time_t now; + struct tm *tm; +@@ -284,10 +299,14 @@ static void report_mce_event(struct ras_events *ras, + * not available (legacy kernels). 
+ */ + +- if (ras->use_uptime) +- now = record->ts / user_hz + ras->uptime_diff; +- else +- now = time(NULL); ++ if (!e->erst) { ++ if (ras->use_uptime) ++ now = record->ts / user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ } else { ++ now = e->walltime; ++ } + + tm = localtime(&now); + if (tm) +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 57984ec..f120874 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -78,6 +78,7 @@ struct mce_event { + char mcastatus_msg[1024]; + char user_action[4096]; + char mc_location[256]; ++ int erst; + }; + + struct mce_priv { +@@ -108,6 +109,7 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus); + int ras_mce_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int init_mce_priv(struct ras_events *ras); + + /* enables intel iMC logs */ + int set_intel_imc_log(enum cputype cputype, unsigned int ncpus); +@@ -170,4 +172,6 @@ int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); + + int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); + ++void report_mce_event(struct ras_events *ras, struct tep_record *record, ++ struct trace_seq *s, struct mce_event *e); + #endif +diff --git a/ras-record.h b/ras-record.h +index 2dd6630..eb5b838 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -28,6 +28,7 @@ struct ras_mc_event { + signed char top_layer, middle_layer, lower_layer; + unsigned long long address, grain, syndrome; + const char *driver_detail; ++ int erst; + }; + + struct ras_mc_offline_event { +@@ -46,6 +47,9 @@ struct ras_aer_event { + uint8_t tlp_header_valid; + uint32_t *tlp_header; + const char *msg; ++ int erst; ++ uint16_t vendor_id; ++ uint16_t device_id; + }; + + struct ras_extlog_event { +diff --git a/rasdaemon.c b/rasdaemon.c +index 6505dee..be5c390 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -10,6 +10,7 @@ + #include + #include + ++#include "ras-erst.h" + #include "ras-events.h" + 
#include "ras-logger.h" + #include "ras-poison-page-stat.h" +@@ -225,6 +226,16 @@ int main(int argc, char *argv[]) + if (daemon(0, 0)) + exit(EXIT_FAILURE); + ++#ifdef HAVE_ERST ++#ifdef HAVE_MCE ++ if (choices_disable && strlen(choices_disable) != 0 && ++ strstr(choices_disable, "ras:erst")) ++ log(ALL, LOG_INFO, "Disabled ras:erst from config\n"); ++ else ++ handle_erst(); ++#endif ++#endif ++ + handle_ras_events(args.record_events, args.enable_ipmitool); + + return 0; +-- +2.43.5 + diff --git a/1009-aer-print-pci-device-name-and-vendor-device-id.patch b/1009-aer-print-pci-device-name-and-vendor-device-id.patch new file mode 100644 index 0000000000000000000000000000000000000000..ac5177eaf1b87638bb38f1f189dd066dcd024b01 --- /dev/null +++ b/1009-aer-print-pci-device-name-and-vendor-device-id.patch @@ -0,0 +1,166 @@ +From 5d8df52470036771ee97fa93ea0abcf3c3fbb3f3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 27 Mar 2025 17:27:38 +0800 +Subject: [PATCH 09/30] aer: print pci device name and vendor/device id + +New aer log like follow: + + <...>-2682840 [125] .... 
0.017661 aer_event 2025-03-27 +17:34:44 +0800 0000:99:00.0 (Intel Corporation Device 0b60 - +vendor_id: 0x8086 device_id: 0xb60) Data Link Protocol Uncorrected +(Non-Fatal) + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 4 ++-- + configure.ac | 8 ++++++++ + misc/rasdaemon.spec.in | 2 ++ + ras-aer-handler.c | 46 +++++++++++++++++++++++++++++++++++++++++- + ras-record.h | 2 +- + 5 files changed, 58 insertions(+), 4 deletions(-) + +diff --git a/Makefile.am b/Makefile.am +index e1bcda1..2911a21 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -123,8 +123,8 @@ if WITH_ERST + rasdaemon_SOURCES += ras-erst.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) +-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) ++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ +diff --git a/configure.ac b/configure.ac +index 47e6346..3603c7f 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -54,6 +54,14 @@ AC_ARG_ENABLE([aer], + AS_IF([test "x$enable_aer" = "xyes" || test "x$enable_all" = "xyes"], [ + AC_DEFINE(HAVE_AER,1,"have PCIe AER events collect") + AC_SUBST([WITH_AER]) ++ ++ has_libpci_ver=0 ++ dnl check for pciutils library ++ PKG_CHECK_MODULES([LIBPCI], [libpci], [has_libpci_ver=1]) ++ ++ AS_IF([test "$has_libpci_ver" -eq 0], [ ++ AC_MSG_ERROR([libpci is required but were not found]) ++]) + ]) + AM_CONDITIONAL([WITH_AER], [test x$enable_aer = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_AER], [USE_AER="yes"], [USE_AER="no"]) +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 4cc859f..a30045c 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -17,10 +17,12 @@ BuildRequires: perl-generators + BuildRequires: sqlite-devel + 
BuildRequires: systemd + BuildRequires: libtraceevent-devel ++BuildRequires: pciutils-devel + Provides: bundled(kernel-event-lib) + Requires: hwdata + Requires: perl-DBD-SQLite + Requires: libtraceevent ++Requires: pciutils-devel + %ifarch %{ix86} x86_64 + Requires: dmidecode + %endif +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 5d069f3..53acbc8 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2013 Mauro Carvalho Chehab + */ + ++#include + #include + #include + #include +@@ -63,6 +64,45 @@ void ras_aer_handler_init(int enable_ipmitool) + + #define BUF_LEN 1024 + ++static void get_pci_dev_name(char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id) ++{ ++ struct pci_access *pacc; ++ struct pci_dev *dev; ++ struct pci_filter filter = {0}; ++ char *err; ++ ++ if (!pci_name) ++ return; ++ ++ pacc = pci_alloc(); ++ if (!pacc) ++ return; ++ ++ pci_init(pacc); ++ pci_scan_bus(pacc); ++ pci_filter_init(pacc, &filter); ++ err = pci_filter_parse_slot(&filter, bdf); ++ if (err) { ++ log(TERM, LOG_ERR, "Invalid PCI device name %s\n", bdf); ++ goto free; ++ } ++ ++ for (dev = pacc->devices; dev; dev = dev->next) { ++ if (pci_filter_match(&filter, dev)) { ++ pci_fill_info(dev, PCI_FILL_IDENT); ++ *vendor_id = dev->vendor_id; ++ *device_id = dev->device_id; ++ pci_lookup_name(pacc, pci_name, len, ++ PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE, ++ dev->vendor_id, dev->device_id); ++ break; ++ } ++ } ++ ++free: ++ pci_cleanup(pacc); ++} ++ + int ras_aer_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -75,7 +115,8 @@ int ras_aer_event_handler(struct trace_seq *s, + time_t now; + struct tm *tm; + struct ras_aer_event ev; +- char buf[BUF_LEN]; ++ char buf[BUF_LEN] = { 0 }; ++ uint16_t vendor_id = 0, device_id = 0; + #ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; +@@ -108,6 +149,9 @@ int ras_aer_event_handler(struct trace_seq *s, + 
return -1; + trace_seq_printf(s, "%s ", ev.dev_name); + ++ get_pci_dev_name(ev.dev_name, buf, sizeof(buf), &vendor_id, &device_id); ++ trace_seq_printf(s, "(%s - vendor_id: %#x device_id: %#x) ", buf, vendor_id, device_id); ++ + if (tep_get_field_val(s, event, "status", record, &status_val, 1) < 0) + return -1; + +diff --git a/ras-record.h b/ras-record.h +index eb5b838..ce7d12c 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -43,7 +43,7 @@ struct ras_mc_offline_event { + struct ras_aer_event { + char timestamp[64]; + const char *error_type; +- const char *dev_name; ++ char *dev_name; + uint8_t tlp_header_valid; + uint32_t *tlp_header; + const char *msg; +-- +2.43.5 + diff --git a/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch new file mode 100644 index 0000000000000000000000000000000000000000..b5a6922e71e6063526fabf05ebbf639d9d7add4f --- /dev/null +++ b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch @@ -0,0 +1,332 @@ +From 921765e3ccd8333c5474000e409dfb0ec80c8f32 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 27 Mar 2025 17:45:16 +0800 +Subject: [PATCH 10/30] rasdaemon: introduce EDPC config in rasdaemon + +System with EDPC enabled device can recovery from fatal aer error. +Rasdaemon now helps users correctly configure EDPC functionality. + +Rasdaemon will enable EDPC for fatal error if PCIE_EDPC_ENABLE set +to 1. All device with EDPC capability will be enabled by default +if EDPC_DEVICE is specified, only the specified device will be +enabled. For example: + PCIE_EDPC_ENABLE=1 + EDPC_DEVICE=0000:01:00.0 +only enable device 0000:01:00.0. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 4 +- + misc/rasdaemon.env | 11 +++ + ras-pcie-edpc.c | 217 +++++++++++++++++++++++++++++++++++++++++++++ + ras-pcie-edpc.h | 9 ++ + rasdaemon.c | 5 ++ + 5 files changed, 244 insertions(+), 2 deletions(-) + create mode 100644 ras-pcie-edpc.c + create mode 100644 ras-pcie-edpc.h + +diff --git a/Makefile.am b/Makefile.am +index 2911a21..bb3d420 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -56,7 +56,7 @@ if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif + if WITH_AER +- rasdaemon_SOURCES += ras-aer-handler.c ++ rasdaemon_SOURCES += ras-aer-handler.c ras-pcie-edpc.c + endif + if WITH_NON_STANDARD + rasdaemon_SOURCES += ras-non-standard-handler.c +@@ -133,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 193ee19..0516c9c 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -101,3 +101,14 @@ MC_CE_STAT_THRESHOLD=2000 + POISON_STAT_THRESHOLD=102400 + + ERST_DELETE=1 ++ ++# EDPC config ++# ++# rasdaemon will enable EDPC for fatal error if PCIE_EDPC_ENABLE set to 1 ++# All device with EDPC capability will be enabled by default, ++# if EDPC_DEVICE is specified, only the specified device will be enabled ++# For example: ++# PCIE_EDPC_ENABLE=1 ++# EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0 ++PCIE_EDPC_ENABLE=0 ++EDPC_DEVICE= +diff --git a/ras-pcie-edpc.c b/ras-pcie-edpc.c +new file mode 100644 +index 0000000..4731b05 +--- /dev/null ++++ b/ras-pcie-edpc.c 
+@@ -0,0 +1,217 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-pcie-edpc.h" ++#include "ras-logger.h" ++#include "types.h" ++ ++#define EDPC_DEVICE "EDPC_DEVICE" ++ ++#define PCI_EXP_DPC_CTL_EN_MASK 0x3 ++ ++static char *edpc_str[] = { ++ [PCI_EXP_DPC_CTL_EN_FATAL] = "Fatal Error", ++ [PCI_EXP_DPC_CTL_EN_NONFATAL] = "Non-Fatal Error", ++}; ++ ++static bool is_cxl_mem_or_cache(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ u32 hdr; ++ u16 vendor, cxl_cap, id; ++ ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DVSEC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return false; ++ ++ hdr = pci_read_long(dev, cap->addr + PCI_DVSEC_HEADER1); ++ vendor = hdr & GENMASK(15, 0); ++ id = pci_read_word(dev, cap->addr + PCI_DVSEC_HEADER2); ++ if (vendor != PCI_DVSEC_VENDOR_ID_CXL || id != PCI_DVSEC_ID_CXL) ++ return false; ++ ++ cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_CAP); ++ if (cxl_cap & (PCI_CXL_CAP_CACHE | PCI_CXL_CAP_MEM)) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * CXL 2.0 RAS spec: 4.2: ++ * Enabling eDPC is not recommended in most CXL 2.0 systems because eDPC ++ * containment flow brings the link down, disrupting CXL.cache and ++ * CXL.mem traffic which can lead to host timeouts. 
++ */ ++static void cxl_check_rp(struct pci_dev *dev, struct pci_dev *dpc) ++{ ++ struct pci_dev *dev_p, *dpc_p; ++ for (dev_p = dev->parent; dev_p; dev_p = dev_p->parent) { ++ for (dpc_p = dpc->next; dpc_p; dpc_p = dpc_p->next) { ++ if (dev_p->domain == dpc_p->domain && ++ dev_p->bus == dpc_p->bus && ++ dev_p->dev == dpc_p->dev && ++ dev_p->func == dpc_p->func) { ++ dpc_p->aux = (void *)true; ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x is CXL RP, ignore EDPC config\n", ++ dpc_p->domain, dpc_p->bus, dpc_p->dev, dpc_p->func); ++ } ++ } ++ } ++} ++ ++static bool has_edpc(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ ++ pci_fill_info(dev, PCI_FILL_EXT_CAPS); ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return false; ++ return true; ++} ++ ++static void set_edpc(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ u16 control; ++ int need_config = 0; ++ ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return; ++ ++ control = pci_read_word(dev, cap->addr + PCI_EXP_DPC_CTL); ++ need_config = PCI_DPC_CTL_TRIGGER(control) == PCI_EXP_DPC_CTL_EN_FATAL ? 0 : 1; ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x origin EDPC %s and triggered for %s, %s need config\n", ++ dev->domain, dev->bus, dev->dev, dev->func, ++ (control & PCI_EXP_DPC_CTL_INT_EN) ? "enabled" : "disabled", ++ edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK], ++ need_config ? "" : "not"); ++ ++ if (need_config) { ++ control &= ~PCI_EXP_DPC_CTL_EN_MASK; /* clear only the trigger field, keep the other control bits */ ++ control |= PCI_EXP_DPC_CTL_EN_FATAL; ++ pci_write_word(dev, cap->addr + PCI_EXP_DPC_CTL, control); ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x EDPC %s and triggered for %s\n", ++ dev->domain, dev->bus, dev->dev, dev->func, ++ (control & PCI_EXP_DPC_CTL_INT_EN) ?
"enabled" : "disabled", ++ edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK]); ++ } ++} ++ ++static struct pci_filter *config_pcie_edpc_device(struct pci_access *pacc, char *names, int *len) ++{ ++ int i; ++ struct pci_filter *filter = NULL; ++ char *token, *err, pci_names[MAX_PATH + 1]; ++ ++ strscpy(pci_names, names, sizeof(pci_names)); ++ for (i = 0; pci_names[i] != '\0'; i++) ++ if (pci_names[i] == ',') ++ (*len)++; ++ ++ filter = calloc(*len, sizeof(struct pci_filter)); ++ if (!filter) ++ return NULL; ++ ++ i = 0; ++ token = strtok(pci_names, ","); ++ while (token) { ++ pci_filter_init(pacc, &filter[i]); ++ err = pci_filter_parse_slot(&filter[i++], token); ++ if (err) { ++ free(filter); ++ log(TERM, LOG_ERR, "Invalid PCI device name %s\n", err); ++ return NULL; ++ } ++ token = strtok(NULL, ","); ++ } ++ ++ log(TERM, LOG_INFO, "Config PCIE EDPC for: %s\n", names); /* informational, not an error */ ++ ++ return filter; ++} ++ ++int config_pcie_edpc(void) ++{ ++ struct pci_access *pacc; ++ struct pci_dev *dev, *dev_head, *tmp; ++ int ret = 0, len = 1, i; ++ char *pci_names; ++ struct pci_filter *filter = NULL; ++ struct pci_dev dev_dpc_head = { 0 }; ++ ++ pacc = pci_alloc(); ++ if (!pacc) ++ return -1; ++ ++ pci_init(pacc); ++ pci_scan_bus(pacc); ++ ++ pci_names = getenv(EDPC_DEVICE); ++ if (pci_names && strlen(pci_names) != 0) { ++ filter = config_pcie_edpc_device(pacc, pci_names, &len); ++ if (!filter) ++ goto free; ++ } else { ++ len = 0; ++ } ++ ++ dev_head = pacc->devices; ++ for (dev = dev_head; dev; dev = dev->next) { ++ pci_fill_info(dev, PCI_FILL_PARENT); ++ if (has_edpc(dev)) { ++ tmp = malloc(sizeof(struct pci_dev)); ++ if (!tmp) { ++ ret = -1; ++ goto free; ++ } ++ ++ memcpy(tmp, dev, sizeof(struct pci_dev)); ++ tmp->next = dev_dpc_head.next; ++ dev_dpc_head.next = tmp; ++ } ++ } ++ ++ for (dev = dev_head; dev; dev = dev->next) ++ if (is_cxl_mem_or_cache(dev)) ++ cxl_check_rp(dev, &dev_dpc_head); ++ ++ for (dev = dev_dpc_head.next; dev; dev = dev->next) { ++ if (!dev->aux) { ++ if (len) { ++
for (i = 0; i < len; i++) { ++ if (pci_filter_match(&filter[i], dev)) { ++ set_edpc(dev); ++ break; ++ } ++ } ++ } else { ++ set_edpc(dev); ++ } ++ } ++ } ++ ++free: ++ while (dev_dpc_head.next) { ++ tmp = dev_dpc_head.next; ++ dev_dpc_head.next = tmp->next; ++ free(tmp); ++ } ++ ++ pci_cleanup(pacc); ++ free(filter); ++ return ret; ++} +diff --git a/ras-pcie-edpc.h b/ras-pcie-edpc.h +new file mode 100644 +index 0000000..a7b96a4 +--- /dev/null ++++ b/ras-pcie-edpc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++ #define PCIE_EDPC_ENABLE "PCIE_EDPC_ENABLE" ++ ++int config_pcie_edpc(void); +diff --git a/rasdaemon.c b/rasdaemon.c +index be5c390..3d4c2ec 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -16,6 +16,7 @@ + #include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "ras-mc-handler.h" ++#include "ras-pcie-edpc.h" + #include "types.h" + + /* +@@ -235,6 +236,10 @@ int main(int argc, char *argv[]) + handle_erst(); + #endif + #endif ++ if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE))) ++ config_pcie_edpc(); ++ else ++ log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n"); + + handle_ras_events(args.record_events, args.enable_ipmitool); + +-- +2.43.5 + diff --git a/1011-rasdaemon-support-nvgpu-event.patch b/1011-rasdaemon-support-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..b2ca511c696647fac28185033b4d7b564edf78f0 --- /dev/null +++ b/1011-rasdaemon-support-nvgpu-event.patch @@ -0,0 +1,511 @@ +From 0696914f490288081325b2a4425de1f0d45c4554 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 11 Apr 2025 13:30:10 +0800 +Subject: [PATCH 11/30] rasdaemon: support nvgpu event + +Use nvml library to report nvgpu event. New environment +NVGPU_DISABLE_EVENT indicate registered events. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 13 +++- + configure.ac | 11 +++ + contrib/nvml.py | 77 ++++++++++++++++++++ + misc/rasdaemon.env | 7 ++ + ras-nvgpu-nvml.c | 178 +++++++++++++++++++++++++++++++++++++++++++++ + ras-nvgpu.c | 54 ++++++++++++++ + ras-nvgpu.h | 14 ++++ + rasdaemon.c | 27 +++++++ + 9 files changed, 380 insertions(+), 2 deletions(-) + create mode 100644 contrib/nvml.py + create mode 100644 ras-nvgpu-nvml.c + create mode 100644 ras-nvgpu.c + create mode 100644 ras-nvgpu.h + +diff --git a/Makefile.am b/Makefile.am +index bb3d420..58ac082 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -17,10 +17,12 @@ EXTRA_DIST = \ + $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ ++ contrib/nvml.py \ + contrib/mc_event_trigger \ + contrib/mem_fail_trigger + + CLEANFILES= \ ++ ras-nvgpu-nvml.h \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ +@@ -123,7 +125,14 @@ if WITH_ERST + rasdaemon_SOURCES += ras-erst.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) ++if WITH_NVGPU ++ BUILT_SOURCES = ras-nvgpu-nvml.h ++ras-nvgpu-nvml.h: contrib/nvml.py ++ python3 $< > $@ ++ rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c ++endif ++ ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ +@@ -133,7 +142,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h + + # This rule can't be called 
with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 3603c7f..43d845d 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -278,6 +278,16 @@ AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) + ++AC_ARG_ENABLE([nvgpu], ++ AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events])) ++ ++AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect") ++ AC_SUBST([WITH_NVGPU]) ++]) ++AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -326,4 +336,5 @@ compile time options summary + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE + Signal : $USE_SIGNAL + ERST : $USE_ERST ++ NVGPU RAS errors : $USE_NVGPU + EOF +diff --git a/contrib/nvml.py b/contrib/nvml.py +new file mode 100644 +index 0000000..9f2c57d +--- /dev/null ++++ b/contrib/nvml.py +@@ -0,0 +1,77 @@ ++import re ++ ++PATH="/usr/local/cuda/include/nvml.h" ++func = ["nvmlInit", ++ "nvmlDeviceGetSupportedEventTypes", ++ "nvmlDeviceRegisterEvents", ++ "nvmlEventSetCreate", ++ "nvmlEventSetWait", ++ "nvmlDeviceGetCount", ++ "nvmlDeviceGetHandleByIndex", ++ "nvmlDeviceGetPciInfo", ++ "nvmlEventSetFree", ++ "nvmlShutdown"] ++ ++pattern = re.compile( ++ r'^nvmlReturn_t DECLDIR\s+({})(\(.*?\));'.format('|'.join(map(re.escape, func))), ++ flags=re.MULTILINE ++) ++ ++type_pattern = re.compile( ++ r'^#define\s+nvmlEventType(\w+)\s+0x.*', ++ flags=re.MULTILINE ++) ++ ++with open(PATH, 'r') as file: ++ content = file.read() ++ matched_lines = pattern.findall(content) ++ type_lines = 
type_pattern.findall(content) ++ ++func_declares = [] ++func_defs = [] ++func_inits = [] ++type_strs = [] ++ ++for match in matched_lines: ++ func_declares.append('typedef nvmlReturn_t (*my_{}_p){};'.format(match[0], match[1])) ++ func_defs.append('my_{}_p my_{};'.format(match[0], match[0])) ++ func_inits.append('my_{0} = (my_{0}_p)dlsym(handle, "{0}"); \ ++ \n\tif (!my_{0}) {{ \ ++ \n\t\tprintf(\"Failed to load {0}: %s\\n\", dlerror()); \ ++ \n\t\treturn -1; \ ++ \n\t}}'.format(match[0])) ++ ++for type_line in type_lines: ++ type_strs.append('case nvmlEventType{}: return \"{}\";'.format(type_line, type_line)) ++ ++print(''' ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++''' ++) ++print('#include \ ++ \n#include \ ++ \n#include "/usr/local/cuda/include/nvml.h"') ++print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);') ++print('\n'.join(func_declares)) ++print('\nmy_nvmlErrorString_p my_nvmlErrorString;') ++print('\n'.join(func_defs)) ++print('\nstatic int my_nvml_setup(void* handle) \n{{\n\t{}{}\n\treturn 0;\n}}'.format('\n\t'.join(func_inits), ++ '\n\tmy_nvmlErrorString = (my_nvmlErrorString_p)dlsym(handle, "nvmlErrorString"); \ ++ \n\tif (!my_nvmlErrorString) { \ ++ \n\t\tprintf(\"Failed to load nvmlErrorString: %s\\n\", dlerror()); \ ++ \n\t\treturn -1; \ ++ \n\t}')) ++print('\nstatic const char* my_nvmlEventTypeString(unsigned long long type) \n{{ \ ++ \n\n\tswitch (type) {{ \ ++ \n\t{} \ ++ \n\tdefault: return \"Unknown\"; \ ++ \n\t}} \ ++ \n\treturn \"Unknown\"; \ ++ \n}}'.format('\n\t'.join(type_strs))) ++ ++ +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 0516c9c..60544f7 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -112,3 +112,10 @@ ERST_DELETE=1 + # EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0 + PCIE_EDPC_ENABLE=0 + EDPC_DEVICE= ++ ++# Registered event type for nvgpu, default is ++# nvmlEventTypeAll & ~nvmlEventTypeClock ++# ref: 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html ++# For example: ++# NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock ++NVGPU_DISABLE_EVENT="0x10" +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +new file mode 100644 +index 0000000..aabe8f9 +--- /dev/null ++++ b/ras-nvgpu-nvml.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-nvgpu-nvml.h" ++#include "ras-nvgpu.h" ++#include "trace-seq.h" ++#include "types.h" ++ ++#define XID_EVENT_NAME "xid" ++ ++const char *lib_name[] = { ++ "/lib64/libnvidia-ml.so", ++ "/lib64/libnvidia-ml.so.1", ++ "/usr/local/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so", ++ "/usr/local/cuda/targets/sbsa-linux/lib/stubs/libnvidia-ml.so" ++}; ++ ++static void *find_lib(void) ++{ ++ void *handle = NULL; ++ ++ for (int i = 0; i < ARRAY_SIZE(lib_name); i++) { ++ handle = dlopen(lib_name[i], RTLD_LAZY); ++ if (handle) ++ return handle; ++ } ++ ++ log(ALL, LOG_ERR, "Failed to load libnvidia-ml\n"); ++ return NULL; ++} ++ ++static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) ++{ ++ struct trace_seq s; ++ nvmlPciInfo_t pci; ++ time_t now; ++ struct tm *tm; ++ char timestamp[64] = { 0 }; /* stays an empty string if localtime() fails */ ++ ++ time(&now); ++ tm = localtime(&now); ++ ++ if (tm) ++ strftime(timestamp, sizeof(timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ my_nvmlDeviceGetPciInfo(data->device, &pci); ++ ++ trace_seq_init(&s); ++ if (data->eventType == nvmlEventTypeXidCriticalError) { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); ++ trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "xid: %lld ", data->eventData); ++ } else { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); ++ trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "event_type:
%s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType); ++ trace_seq_printf(&s, "data: %lld ", data->eventData); ++ } ++ ++ trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); ++ trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); ++ ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++ ++ return 0; ++} ++ ++int ras_nvgpu_nvml_handle(void) ++{ ++ void *nvml_handle; ++ nvmlReturn_t ret; ++ unsigned int device_count; ++ nvmlDevice_t *devices; ++ nvmlEventSet_t event_set; ++ char *event_types_str = NULL; ++ unsigned long long disable = 0, event_types = 0; ++ nvmlEventData_t event_data; ++ ++ nvml_handle = find_lib(); ++ if (!nvml_handle) { ++ log(ALL, LOG_ERR, "Failed to load libnvidia-ml: %s\n", dlerror()); ++ return 1; ++ } ++ ++ if (my_nvml_setup(nvml_handle)) { ++ log(ALL, LOG_ERR, "Failed to setup libnvidia-ml\n"); ++ dlclose(nvml_handle); ++ return 1; ++ } ++ ++ ret = my_nvmlInit(); ++ if (ret) { ++ log(ALL, LOG_ERR, "NVML Init failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dl; ++ } ++ ++ ret = my_nvmlDeviceGetCount(&device_count); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get device count failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_nvml; ++ } ++ ++ devices = malloc(device_count * sizeof(nvmlDevice_t)); ++ if (!devices) { ++ log(ALL, LOG_ERR, "Failed to allocate memory for devices\n"); ++ goto free_nvml; ++ } ++ ++ for (unsigned int i = 0; i < device_count; i++) { ++ ret = my_nvmlDeviceGetHandleByIndex(i, &devices[i]); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get device handle failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dev; ++ } ++ } ++ ++ ret = my_nvmlEventSetCreate(&event_set); ++ if (ret) { ++ log(ALL, LOG_ERR, "Create event set failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dev; ++ } ++ ++ event_types_str =
getenv("NVGPU_DISABLE_EVENT"); ++ if (event_types_str) { ++ disable = strtoull(event_types_str, NULL, 0); ++ log(ALL, LOG_INFO, "Disable NVGPU events %s\n", my_nvmlEventTypeString(disable)); ++ } ++ ++ for (unsigned int i = 0; i < device_count; i++) { ++ ret = my_nvmlDeviceGetSupportedEventTypes(devices[i], &event_types); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get support events failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_event; ++ } ++ ++ ret = my_nvmlDeviceRegisterEvents(devices[i], event_types & ~disable, event_set); ++ if (ret) { ++ log(ALL, LOG_ERR, "Register events failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_event; ++ } ++ } ++ ++ while (1) { ++ ret = my_nvmlEventSetWait(event_set, &event_data, -1); ++ if (!ret) ++ report_ras_gpu_nvml(&event_data, devices); ++ else { ++ log(ALL, LOG_ERR, "Wait for event failed: %s\n", my_nvmlErrorString(ret)); ++ break; ++ } ++ } ++ ++free_event: ++ my_nvmlEventSetFree(event_set); ++free_dev: ++ free(devices); ++free_nvml: ++ my_nvmlShutdown(); ++free_dl: ++ dlclose(nvml_handle); ++ return ret; ++} +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +new file mode 100644 +index 0000000..5c63279 +--- /dev/null ++++ b/ras-nvgpu.c +@@ -0,0 +1,54 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-events.h" ++#include "ras-logger.h" ++#include "ras-nvgpu.h" ++void *ras_nvgpu_handle(void *arg) ++{ ++ (void)arg; ++ sigset_t set; ++ struct stat st; ++ int retry = 3; ++ ++ if (stat("/dev/nvidia0", &st) == -1) { ++ log(ALL, LOG_WARNING, "NVIDIA device not found: %s\n", strerror(errno)); ++ return NULL; ++ } ++ if (!S_ISCHR(st.st_mode)) { ++ log(ALL, LOG_WARNING, "NVIDIA device is not a character device\n"); ++ return NULL; ++ } ++ ++ sigemptyset(&set); ++ sigaddset(&set, SIGINT); ++ sigaddset(&set, SIGTERM); ++ sigaddset(&set, SIGHUP); ++ sigaddset(&set, SIGQUIT); ++ if 
(pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { ++ log(ALL, LOG_ERR, "Failed to set thread signal mask\n"); ++ return NULL; ++ } ++ ++ while (retry--) { ++ if (ras_nvgpu_nvml_handle()) { ++ log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry); ++ sleep(10); ++ } ++ } ++ ++ log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n"); ++ ++ return NULL; ++} +diff --git a/ras-nvgpu.h b/ras-nvgpu.h +new file mode 100644 +index 0000000..32827ad +--- /dev/null ++++ b/ras-nvgpu.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_NVGPU_H ++#define __RAS_NVGPU_H ++ ++#define NVGPU_EVENT_NAME "nvgpu" ++ ++void *ras_nvgpu_handle(void *arg); ++int ras_nvgpu_nvml_handle(void); ++#endif +diff --git a/rasdaemon.c b/rasdaemon.c +index 3d4c2ec..9c5f9dd 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -17,6 +18,7 @@ + #include "ras-record.h" + #include "ras-mc-handler.h" + #include "ras-pcie-edpc.h" ++#include "ras-nvgpu.h" + #include "types.h" + + /* +@@ -241,7 +243,32 @@ int main(int argc, char *argv[]) + else + log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n"); + ++#ifdef HAVE_NVGPU ++ pthread_t nvgpu_thread = 0, main_thread = pthread_self(); ++ bool nvgpu_enable = true; ++ ++ if (choices_disable && strlen(choices_disable) != 0 && ++ strstr(choices_disable, NVGPU_EVENT_NAME)) { ++ nvgpu_enable = false; ++ log(ALL, LOG_INFO, "Disable nvgpu event.\n"); ++ } ++ ++ if (nvgpu_enable) { ++ if (pthread_create(&nvgpu_thread, NULL, ras_nvgpu_handle, &main_thread) != 0) { ++ log(ALL, LOG_ERR, "Failed to create XID thread\n"); ++ pthread_cancel(nvgpu_thread); ++ exit(EXIT_FAILURE); ++ } ++ pthread_detach(nvgpu_thread); ++ log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n"); ++ } ++#endif + handle_ras_events(args.record_events, args.enable_ipmitool); + ++#ifdef HAVE_NVGPU ++ if (nvgpu_enable) ++ 
pthread_cancel(nvgpu_thread); ++#endif ++ + return 0; + } +-- +2.43.5 + diff --git a/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch new file mode 100644 index 0000000000000000000000000000000000000000..4292bb075257cf6d4c26d11b711444c40a51635d --- /dev/null +++ b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch @@ -0,0 +1,937 @@ +From 9163f3cd0f9344aacf8eb4b061f3ea2269f6c0cb Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 7 Jun 2024 11:26:06 +0800 +Subject: [PATCH 12/30] rasdaemon: enhance rasdaemon event trigger + +- Add trigger timeout to avoid trigger hang. +- Move all trigger code to trigger.c + +Use $(TRIGGER_NAME)_TIMEOUT to set trigger timeout val, for example: + +MC_CE_TRIGGER: The script executed when corrected mc_event occurs. +MC_CE_TRIGGER_TIMEOUT: Timeout(seconds) for MC_CE_TRIGGER, set 0 to +delete timeout. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 6 +- + contrib/aer_trigger | 27 +++ + contrib/mc_event_trigger | 9 + + contrib/mce_record_trigger | 46 +++++ + contrib/mem_fail_trigger | 21 +- + misc/rasdaemon.env | 23 ++- + ras-aer-handler.c | 3 + + ras-events.c | 18 -- + ras-mc-handler.c | 89 +-------- + ras-mce-handler.c | 3 + + ras-memory-failure-handler.c | 55 +---- + trigger.c | 376 ++++++++++++++++++++++++++++++++--- + trigger.h | 19 +- + 13 files changed, 493 insertions(+), 202 deletions(-) + create mode 100755 contrib/aer_trigger + create mode 100755 contrib/mce_record_trigger + +diff --git a/Makefile.am b/Makefile.am +index 58ac082..72f30b4 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -18,8 +18,7 @@ EXTRA_DIST = \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/nvml.py \ +- contrib/mc_event_trigger \ +- contrib/mem_fail_trigger ++ contrib/*_trigger + + CLEANFILES= \ + ras-nvgpu-nvml.h \ +@@ -171,8 +170,6 @@ install-data-local: + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" + 
install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" +- $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger" +- $(install_sh) @abs_srcdir@/contrib/mem_fail_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger" + if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ + fi +@@ -182,3 +179,4 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi ++ $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" +diff --git a/contrib/aer_trigger b/contrib/aer_trigger +new file mode 100755 +index 0000000..87f9da9 +--- /dev/null ++++ b/contrib/aer_trigger +@@ -0,0 +1,27 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# aer_event has occurred; environment variables include all ++# information reported by tracepoint. ++ ++# environment: ++# TIMESTAMP Timestamp when error occurred ++# ERROR_TYPE Corrected | Uncorrected (Non-Fatal) | Uncorrected (Fatal) ++# DEV_NAME BDF ++# TLP_HEADER_VALID ++# TLP_HEADER ++# MSG ++# ++ ++[ -x ./aer_trigger.local ] && . ./aer_trigger.local ++ ++if [ -d aer_trigger.extern ] ++then ++ ls aer_trigger.extern | ++ while read item ++ do ++ [ -x ./aer_trigger.extern/$item ] && . ./aer_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger +index 9862595..5c68b56 100755 +--- a/contrib/mc_event_trigger ++++ b/contrib/mc_event_trigger +@@ -23,4 +23,13 @@ + + [ -x ./mc_event_trigger.local ] && . 
./mc_event_trigger.local + ++if [ -d mc_event_trigger.extern ] ++then ++ ls mc_event_trigger.extern | ++ while read item ++ do ++ [ -x ./mc_event_trigger.extern/$item ] && . ./mc_event_trigger.extern/$item ++ done ++fi ++ + exit 0 +diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger +new file mode 100755 +index 0000000..ca49e6d +--- /dev/null ++++ b/contrib/mce_record_trigger +@@ -0,0 +1,46 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when a ++# mce_record event has occurred; environment variables include all information ++# reported by tracepoint. ++# ++# environment: ++# MCGCAP MCGCAP MSR: machine check capabilities of CPU ++# MCGSTATUS Machine Check Global Status MSR ++# STATUS Bank's MCi_STATUS MSR ++# ADDR Bank's MCi_ADDR MSR ++# MISC Bank's MCi_MISC MSR ++# IP Instruction Pointer when the error happened ++# TSC CPU time stamp counter ++# WALLTIME Wall time_t when error was detected ++# CPU CPU number; obsoleted by extcpu ++# CPUID CPUID 1 EAX ++# APICID CPU initial APIC ID ++# SOCKETID CPU socket ID ++# CS Code segment ++# BANK Machine check bank reporting the error ++# CPUVENDOR Kernel's X86_VENDOR enum ++# SYND MCA_SYND MSR: only valid on SMCA systems ++# IPID MCA_IPID MSR: only valid on SMCA systems ++# TIMESTAMP Rasdaemon timestamp ++# BANK_NAME Decoded bank name ++# ERROR_MSG Vendor-defined error message ++# MCGSTATUS_MSG Decoded mcgstatus ++# MCISTATUS_MSG Decoded mcistatus ++# MCASTATUS_MSG Decoded mcastatus ++# USER_ACTION Recommendations for actions users should take ++# MC_LOCATION Error location in MC ++# ++ ++[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local ++ ++if [ -d mce_record_trigger.extern ] ++then ++ ls mce_record_trigger.extern | ++ while read item ++ do ++ [ -x ./mce_record_trigger.extern/$item ] && . 
./mce_record_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/contrib/mem_fail_trigger b/contrib/mem_fail_trigger +index d75ce50..f63df91 100755 +--- a/contrib/mem_fail_trigger ++++ b/contrib/mem_fail_trigger +@@ -1,14 +1,25 @@ + #!/bin/sh + # SPDX-License-Identifier: GPL-2.0 +-# + # This shell script can be executed by rasdaemon in daemon mode when a + # memory_failure_event is occured, environment variables include all + # information reported by tracepoint. ++ ++# environment: ++# TIMESTAMP Timestamp when error occurred ++# PFN Offlined page PFN ++# PAGE_TYPE Page type ++# ACTION_RESULT Action result + # + +-echo TIMESTAMP: $TIMESTAMP +-echo PFN: $PFN +-echo PAGE_TYPE: $PAGE_TYPE +-echo ACTION_RESULT: $ACTION_RESULT ++[ -x ./mf_trigger.local ] && . ./mf_trigger.local ++ ++if [ -d mf_trigger.extern ] ++then ++ ls mf_trigger.extern | ++ while read item ++ do ++ [ -x ./mf_trigger.extern/$item ] && . ./mf_trigger.extern/$item ++ done ++fi + + exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 60544f7..1f5da55 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -83,11 +83,30 @@ TRIGGER_DIR= + + # Execute these triggers when the mc_event occured, the triggers will not + # be executed if the trigger is not specified. ++# You can set a timeout (in seconds) for each trigger; the trigger process is killed when it expires. ++# The default timeout is 1 second; set it to 0 to disable the timeout. 
+ # For example: +-# MC_CE_TRIGGER=mc_event_trigger + # MC_UE_TRIGGER=mc_event_trigger +-MC_CE_TRIGGER= ++# MC_UE_TRIGGER_TIMEOUT=1 ++ ++# trigger for mc_event + MC_UE_TRIGGER= ++MC_UE_TRIGGER_TIMEOUT=0 ++ ++MCE_DE_TRIGGER= ++MCE_UE_TRIGGER= ++MCE_DE_TRIGGER_TIMEOUT=0 ++MCE_UE_TRIGGER_TIMEOUT=0 ++ ++MF_TRIGGER= ++MF_TRIGGER_TIMEOUT=0 ++ ++AER_CE_TRIGGER= ++AER_UE_TRIGGER= ++AER_FATAL_TRIGGER= ++AER_CE_TRIGGER_TIMEOUT=0 ++AER_UE_TRIGGER_TIMEOUT=0 ++AER_FATAL_TRIGGER_TIMEOUT=0 + + # CE Statistic Threshold + # +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 53acbc8..471ad9f 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -17,6 +17,7 @@ + #include "ras-report.h" + #include "unified-sel.h" + #include "types.h" ++#include "trigger.h" + + /* bit field meaning for correctable error */ + static const char *aer_cor_errors[32] = { +@@ -254,5 +255,7 @@ int ras_aer_event_handler(struct trace_seq *s, + return -1; + #endif + ++ run_aer_event_trigger(&ev); ++ + return 0; + } +diff --git a/ras-events.c b/ras-events.c +index d42ed9f..06f9a37 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -54,13 +54,6 @@ + + char *choices_disable; + +-static const struct event_trigger event_triggers[] = { +- { "mc_event", &mc_event_trigger_setup }, +-#ifdef HAVE_MEMORY_FAILURE +- { "memory_failure_event", &mem_fail_event_trigger_setup }, +-#endif +-}; +- + static int get_debugfs_dir(char *tracing_dir, size_t len) + { + FILE *fp; +@@ -328,17 +321,6 @@ free_ras: + return 0; + } + +-static void setup_event_trigger(char *event) +-{ +- struct event_trigger trigger; +- +- for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) { +- trigger = event_triggers[i]; +- if (!strcmp(event, trigger.name)) +- trigger.setup(); +- } +-} +- + #ifdef HAVE_DISKERROR + #if (!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL) + /* +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index 7a18f73..a729d93 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -20,89 +20,6 @@ + #include 
"trigger.h" + #include "types.h" + +-#define MAX_ENV 30 +-static const char *mc_ce_trigger = NULL; +-static const char *mc_ue_trigger = NULL; +- +-void mc_event_trigger_setup(void) +-{ +- const char *trigger; +- +- trigger = getenv("MC_CE_TRIGGER"); +- if (trigger && strcmp(trigger, "")) { +- mc_ce_trigger = trigger_check(trigger); +- +- if (!mc_ce_trigger) { +- log(ALL, LOG_ERR, +- "Cannot access mc_event ce trigger `%s`\n", +- trigger); +- } else { +- log(ALL, LOG_INFO, +- "Setup mc_event ce trigger `%s`\n", +- trigger); +- } +- } +- +- trigger = getenv("MC_UE_TRIGGER"); +- if (trigger && strcmp(trigger, "")) { +- mc_ue_trigger = trigger_check(trigger); +- +- if (!mc_ue_trigger) { +- log(ALL, LOG_ERR, +- "Cannot access mc_event ue trigger `%s`\n", +- trigger); +- } else { +- log(ALL, LOG_INFO, +- "Setup mc_event ue trigger `%s`\n", +- trigger); +- } +- } +-} +- +-static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) +-{ +- char *env[MAX_ENV]; +- int ei = 0; +- int i; +- +- if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) +- goto free; +- if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) +- goto free; +- if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) +- goto free; +- if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) +- goto free; +- if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) +- goto free; +- if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) +- goto free; +- if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) +- goto free; +- if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) +- goto free; +- if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) +- goto free; +- if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) +- goto free; +- if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) +- goto free; +- if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) +- goto free; +- if (asprintf(&env[ei++], "SYNDROME=%llx", 
ev->syndrome) < 0) +- goto free; +- if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) +- goto free; +- env[ei] = NULL; +- assert(ei < MAX_ENV); +- +- run_trigger(mc_trigger, NULL, env, "mc_event"); +- +-free: +- for (i = 0; i < ei; i++) +- free(env[i]); +-} +- + static unsigned long long per_sec_ce_count; + unsigned long long mc_ce_stat_threshold; + static time_t cur; +@@ -312,11 +229,7 @@ int ras_mc_event_handler(struct trace_seq *s, + ras_report_mc_event(ras, &ev); + #endif + +- if (mc_ce_trigger && !strcmp(ev.error_type, "Corrected")) +- run_mc_trigger(&ev, mc_ce_trigger); +- +- if (mc_ue_trigger && !strcmp(ev.error_type, "Uncorrected")) +- run_mc_trigger(&ev, mc_ue_trigger); ++ run_mc_event_trigger(&ev); + + return 0; + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 3d8d97d..92c5339 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -17,6 +17,7 @@ + #include "ras-mce-handler.h" + #include "ras-report.h" + #include "types.h" ++#include "trigger.h" + + /* + * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code, +@@ -598,5 +599,7 @@ int ras_mce_event_handler(struct trace_seq *s, + ras_report_mce_event(ras, &e); + #endif + ++ run_mce_record_trigger(&e); ++ + return 0; + } +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index d4c293b..0f4e937 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -87,59 +87,6 @@ static const struct { + { MF_RECOVERED, "Recovered" }, + }; + +-#define MAX_ENV 6 +-static const char *mf_trigger = NULL; +- +-void mem_fail_event_trigger_setup(void) +-{ +- const char *trigger; +- +- trigger = getenv("MEM_FAIL_TRIGGER"); +- if (trigger && strcmp(trigger, "")) { +- mf_trigger = trigger_check(trigger); +- +- if (!mf_trigger) { +- log(ALL, LOG_ERR, +- "Cannot access memory_fail_event trigger `%s`\n", +- trigger); +- } else { +- log(ALL, LOG_INFO, +- "Setup memory_fail_event trigger `%s`\n", +- trigger); +- } +- } +-} +- +-static void 
run_mf_trigger(struct ras_mf_event *ev) +-{ +- char *env[MAX_ENV]; +- int ei = 0; +- int i; +- +- if (!mf_trigger) +- return; +- +- if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) +- goto free; +- if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) +- goto free; +- if (asprintf(&env[ei++], "PFN=%s", ev->pfn) < 0) +- goto free; +- if (asprintf(&env[ei++], "PAGE_TYPE=%s", ev->page_type) < 0) +- goto free; +- if (asprintf(&env[ei++], "ACTION_RESULT=%s", ev->action_result) < 0) +- goto free; +- +- env[ei] = NULL; +- assert(ei < MAX_ENV); +- +- run_trigger(mf_trigger, NULL, env, "memory_fail_event"); +- +-free: +- for (i = 0; i < ei; i++) +- free(env[i]); +-} +- + static const char *get_page_type(int page_type) + { + unsigned int i; +@@ -222,7 +169,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + /* Report event to ABRT */ + ras_report_mf_event(ras, &ev); + #endif +- run_mf_trigger(&ev); ++ run_mf_event_trigger(&ev); + + return 0; + } +diff --git a/trigger.c b/trigger.c +index aa19a22..a13fffd 100644 +--- a/trigger.c ++++ b/trigger.c +@@ -3,56 +3,378 @@ + #define _GNU_SOURCE + #include + #include ++#include + #include + #include + + #include "ras-logger.h" ++#include "types.h" + #include "trigger.h" + +-void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter) ++#include "ras-mce-handler.h" ++ ++#define MAX_ENV 30 ++static int child_done, alarm_done; ++static char *trigger_dir; ++ ++static void child_handler(int sig) ++{ ++ child_done = 1; ++} ++ ++static void alarm_handler(int sig) ++{ ++ alarm_done = 1; ++} ++ ++void run_trigger(struct event_trigger *t, char *argv[], char **env) + { + pid_t child; +- int status; ++ char *trigger = t->path; ++ const char *path = t->abs_path; ++ int status, timeout = t->timeout; + +- log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); ++ log(ALL, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", ++ trigger, 
t->event_name); + + child = fork(); + if (child < 0) { +- log(SYSLOG, LOG_ERR, "Cannot create process for trigger"); ++ log(ALL, LOG_ERR, "Cannot create process for trigger\n"); + return; ++ } else if (child == 0) { ++ if (execve(path, argv, env) == -1) ++ log(ALL, LOG_ERR, "Trigger %s exec fail: %s\n", path, strerror(errno)); ++ _exit(EXIT_FAILURE); ++ } ++ ++ signal(SIGCHLD, child_handler); ++ if (timeout) { ++ signal(SIGALRM, alarm_handler); ++ alarm(timeout); + } ++ pause(); + +- if (child == 0) { +- execve(trigger, argv, env); +- _exit(127); +- } else { +- waitpid(child, &status, 0); +- if (WIFEXITED(status) && WEXITSTATUS(status)) { +- log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d", +- trigger, WEXITSTATUS(status)); +- } else if (WIFSIGNALED(status)) { +- log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d", +- trigger, WTERMSIG(status)); ++ if (child_done) { ++ if (waitpid(child, &status, WNOHANG) == child) { ++ if (WIFEXITED(status) && WEXITSTATUS(status)) ++ log(ALL, LOG_INFO, ++ "Trigger %s exited with status %d\n", ++ trigger, WEXITSTATUS(status)); ++ else if (WIFSIGNALED(status)) ++ log(ALL, LOG_INFO, ++ "Trigger %s killed by signal %d\n", ++ trigger, WTERMSIG(status)); + } ++ alarm(0); ++ } else if (alarm_done) { ++ log(ALL, LOG_WARNING, "Trigger timeout, kill it\n"); ++ kill(child, SIGKILL); + } ++ ++ signal(SIGCHLD, SIG_DFL); ++ signal(SIGALRM, SIG_DFL); + } + +-const char *trigger_check(const char *s) ++int trigger_check(struct event_trigger *t) + { +- char *name; +- int rc; +- char *trigger_dir = getenv("TRIGGER_DIR"); ++ if (trigger_dir) ++ if (snprintf(t->abs_path, 256, "%s/%s", trigger_dir, t->path) < 0) ++ return -1; ++ ++ return access(t->abs_path, R_OK | X_OK); ++} ++ ++struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"}; ++ ++struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"}; ++struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"}; + +- if (trigger_dir) { +- if (asprintf(&name, 
"%s/%s", trigger_dir, s) < 0) +- return NULL; +- s = name; ++struct event_trigger mf_trigger = {"memory_failure_event", "MEM_FAIL_TRIGGER"}; ++ ++struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"}; ++struct event_trigger aer_ue_trigger = {"aer_event", "AER_UE_TRIGGER"}; ++struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"}; ++ ++static struct event_trigger *event_triggers[] = { ++ &mc_ue_trigger, ++#ifdef HAVE_MCE ++ &mce_de_trigger, ++ &mce_ue_trigger, ++#endif ++#ifdef HAVE_MEMORY_FAILURE ++ &mf_trigger, ++#endif ++#ifdef HAVE_AER ++ &aer_ce_trigger, ++ &aer_ue_trigger, ++ &aer_fatal_trigger, ++#endif ++}; ++ ++void setup_event_trigger(const char *event) ++{ ++ int i, j; ++ struct event_trigger *trigger; ++ char *s, timeout_env[64]; ++ ++ trigger_dir = getenv("TRIGGER_DIR"); ++ ++ for (i = 0; i < ARRAY_SIZE(event_triggers); i++) { ++ trigger = event_triggers[i]; ++ ++ if (strcmp(event, trigger->event_name)) ++ continue; ++ ++ s = getenv(trigger->env); ++ if (!s || !strcmp(s, "")) ++ continue; ++ ++ trigger->path = s; ++ if (trigger_check(trigger)) { ++ log(ALL, LOG_ERR, "Cannot access trigger `%s`: %s\n", s, strerror(errno)); ++ continue; ++ } ++ ++ log(ALL, LOG_NOTICE, "Setup %s trigger `%s`\n", trigger->event_name, s); ++ ++ snprintf(timeout_env, sizeof(timeout_env), "%s_TIMEOUT", trigger->env); ++ ++ trigger->timeout = 1; ++ s = getenv(timeout_env); ++ if (!s || !strcmp(s, "")) { ++ log(ALL, LOG_NOTICE, ++ "Setup %s trigger default timeout 1s\n", ++ trigger->event_name); ++ continue; ++ } ++ ++ j = atoi(s); ++ if (j < 0) ++ log(ALL, LOG_ERR, ++ "Invalid %s trigger timeout `%d` use default value: 1s\n", ++ trigger->event_name, j); ++ else if (j == 0) { ++ log(ALL, LOG_NOTICE, ++ "%s trigger no timeout\n", ++ trigger->event_name); ++ trigger->timeout = 0; ++ } else { ++ log(ALL, LOG_NOTICE, ++ "Setup %s trigger timeout `%d`s\n", ++ trigger->event_name, j); ++ trigger->timeout = j; ++ } + } ++} ++ ++static void 
__run_mce_trigger(struct mce_event *e, struct event_trigger *trigger) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGCAP=%#lx", e->mcgcap) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS=%#lx", e->mcgstatus) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "STATUS=%#lx", e->status) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDR=%#lx", e->addr) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MISC=%#lx", e->misc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IP=%#lx", e->ip) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TSC=%#lx", e->tsc) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "WALLTIME=%#lx", e->walltime) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPU=%#x", e->cpu) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUID=%#x", e->cpuid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "APICID=%#x", e->apicid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SOCKETID=%#x", e->socketid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CS=%#x", e->cs) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK=%#x", e->bank) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "CPUVENDOR=%#x", e->cpuvendor) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYND=%#lx", e->synd) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "IPID=%#lx", e->ipid) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", e->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BANK_NAME=%s", e->bank_name) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ERROR_MSG=%s", e->error_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCGSTATUS_MSG=%s", e->mcgstatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCISTATUS_MSG=%s", e->mcistatus_msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MCASTATUS_MSG=%s", e->mcastatus_msg) < 0) ++ goto free; ++ if 
(asprintf(&env[ei++], "USER_ACTION=%s", e->user_action) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_LOCATION=%s", e->mc_location) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); + +- rc = access(s, R_OK | X_OK); ++ run_trigger(trigger, NULL, env); + +- if (!rc) +- return(s); ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ ++void run_mce_record_trigger(struct mce_event *e) ++{ ++ if (e->status & MCI_STATUS_UC) ++ __run_mce_trigger(e, &mce_ue_trigger); ++ else if (e->status & MCI_STATUS_DEFERRED) ++ __run_mce_trigger(e, &mce_de_trigger); ++} ++ ++static void __run_mc_trigger(struct ras_mc_event *ev, struct event_trigger *trigger) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0, i; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) ++ goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (i = 
0; i < ei; i++) ++ free(env[i]); ++} ++ ++void run_mc_event_trigger(struct ras_mc_event *e) ++{ ++ if (!strcmp(e->error_type, "Uncorrected")) ++ __run_mc_trigger(e, &mc_ue_trigger); ++} ++ ++static void __run_mf_trigger(struct ras_mf_event *ev, struct event_trigger *trigger) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ int i; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; + +- return NULL; ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "PFN=%s", ev->pfn) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "PAGE_TYPE=%s", ev->page_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ACTION_RESULT=%s", ev->action_result) < 0) ++ goto free; ++ ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ ++void run_mf_event_trigger(struct ras_mf_event *e) ++{ ++ __run_mf_trigger(e, &mf_trigger); ++} ++ ++static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *trigger) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ int i; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ERROR_TYPE=%s", ev->error_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DEV_NAME=%s", ev->dev_name) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "TLP_HEADER_VALID=%d", ev->tlp_header_valid) < 0) ++ goto free; ++ if (ev->tlp_header_valid) ++ if (asprintf(&env[ei++], "TLP_HEADER=%08x %08x %08x %08x", ++ ev->tlp_header[0], ev->tlp_header[1], ++ ev->tlp_header[2], ev->tlp_header[3]) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0) ++ goto free; ++ ++ env[ei] = 
NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ ++void run_aer_event_trigger(struct ras_aer_event *e) ++{ ++ if (!strcmp(e->error_type, "Corrected")) ++ __run_aer_trigger(e, &aer_ce_trigger); ++ else if (!strcmp(e->error_type, "Uncorrected (Non-Fatal)")) ++ __run_aer_trigger(e, &aer_ue_trigger); ++ else if (!strcmp(e->error_type, "Uncorrected (Fatal)")) ++ __run_aer_trigger(e, &aer_fatal_trigger); + } +diff --git a/trigger.h b/trigger.h +index 7d25042..31eff96 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -3,12 +3,23 @@ + #ifndef __TRIGGER_H__ + #define __TRIGGER_H__ + ++#include "ras-record.h" ++ + struct event_trigger { +- const char *name; +- void (*setup)(void); ++ const char *event_name; ++ const char *env; ++ char *path; ++ char abs_path[256]; ++ int timeout; + }; + +-const char *trigger_check(const char *s); +-void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter); ++int trigger_check(struct event_trigger *t); ++void run_trigger(struct event_trigger *t, char *argv[], char **envr); ++void setup_event_trigger(const char *event); ++ ++void run_mc_event_trigger(struct ras_mc_event *e); ++void run_mce_record_trigger(struct mce_event *e); ++void run_mf_event_trigger(struct ras_mf_event *e); ++void run_aer_event_trigger(struct ras_aer_event *e); + + #endif +-- +2.43.5 + diff --git a/1013-rasdaemon-add-event-level-for-event-record.patch b/1013-rasdaemon-add-event-level-for-event-record.patch new file mode 100644 index 0000000000000000000000000000000000000000..f7a98e95140ad9d67d3b8b8ed5f3f7097e3b083a --- /dev/null +++ b/1013-rasdaemon-add-event-level-for-event-record.patch @@ -0,0 +1,489 @@ +From 06f2f2a77aa546dcd5b0cb002869d08b8a016e5e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 28 Mar 2025 13:19:47 +0800 +Subject: [PATCH] rasdaemon: add event level for event record + +To help users distinguish more and more events, this patch introduces 
+event levels to indicate the severity of the current event to the +system. Currently, three main levels are used: Alert, Crit, Error. +Fatal events will be marked as "emerg" but in reality, the kernel +will panic upon receiving a fatal event, so rasdaemon does not +receive it. + +ALERT: The uncorrected hardware error has been fixed, but cause + side effects. +CRIT: The uncorrected hardware error has been detected. +ERROR: The corrected hardware error has been detected. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 2 +- + man/rasdaemon.1.in | 15 +++++++++ + ras-aer-handler.c | 22 +++++++++++-- + ras-arm-handler.c | 2 ++ + ras-cxl-handler.c | 7 ++++ + ras-devlink-handler.c | 2 ++ + ras-diskerror-handler.c | 1 + + ras-extlog-handler.c | 20 +++++++++++ + ras-mc-handler.c | 64 +++++++++++++++++++++++------------- + ras-mce-handler.c | 9 +++++ + ras-memory-failure-handler.c | 1 + + ras-nvgpu-nvml.c | 4 +-- + ras-page-isolation.c | 5 +-- + ras-poison-page-stat.c | 4 +-- + ras-signal-handler.c | 4 +-- + types.c | 18 ++++++++++ + types.h | 11 +++++++ + 17 files changed, 156 insertions(+), 35 deletions(-) + create mode 100644 types.c + +diff --git a/Makefile.am b/Makefile.am +index 72f30b4..564a20d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -52,7 +52,7 @@ all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTAT + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +- bitfield.c trigger.c ++ bitfield.c trigger.c types.c + if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index e884e55..2288fd9 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -72,6 +72,21 @@ environment variables. By default the config file is read from /etc/sysconfig/ra + + The general format is environmentname=value. 
+ ++.SH LOG LEVEL ++ ++Each log entry has a level prefix that describes the severity of the log to ++help users determine which logs are more valuable. ++Currently, three levels are used: ++.TP ++.B "ALERT" ++The uncorrected hardware error has been fixed, but caused side effects. ++.TP ++.B "CRIT" ++The uncorrected hardware error has been detected. ++.TP ++.B "ERROR" ++The corrected hardware error has been detected. ++ + .SH SEE ALSO + \fBras-mc-ctl\fR(8) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 471ad9f..c67f267 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -123,6 +123,25 @@ int ras_aer_event_handler(struct trace_seq *s, + uint8_t sel_data[5]; + int seg, bus, dev, fn, rc; + #endif ++ const char *level; ++ ++ if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) ++ return -1; ++ switch (severity_val) { ++ case HW_EVENT_AER_UNCORRECTED_NON_FATAL: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case HW_EVENT_AER_UNCORRECTED_FATAL: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case HW_EVENT_AER_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -156,9 +175,6 @@ int ras_aer_event_handler(struct trace_seq *s, + if (tep_get_field_val(s, event, "status", record, &status_val, 1) < 0) + return -1; + +- if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) +- return -1; +- + /* Fills the error buffer. If it is a correctable error then use the + * aer_cor_errors bit field. Otherwise use aer_uncor_errors. 
+ */ +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index db29327..226feb3 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -489,6 +489,8 @@ int ras_arm_event_handler(struct trace_seq *s, + + memset(&ev, 0, sizeof(ev)); + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); ++ + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 6e5ddea..575fff8 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -133,6 +133,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + struct ras_cxl_poison_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -345,6 +346,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + struct ras_cxl_aer_ue_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_CRIT]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -431,6 +433,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + struct ras_cxl_aer_ce_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -516,6 +519,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct ras_cxl_overflow_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -733,6 +737,7 @@ int 
ras_cxl_generic_event_handler(struct trace_seq *s, + const uint8_t *buf; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +@@ -848,6 +853,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct ras_cxl_general_media_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +@@ -1038,6 +1044,7 @@ int ras_cxl_dram_event_handler(struct trace_seq *s, + struct ras_cxl_dram_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +diff --git a/ras-devlink-handler.c b/ras-devlink-handler.c +index da5645d..93eba91 100644 +--- a/ras-devlink-handler.c ++++ b/ras-devlink-handler.c +@@ -83,6 +83,8 @@ int ras_devlink_event_handler(struct trace_seq *s, + if (ras->filters[DEVLINK_EVENT] && + tep_filter_match(ras->filters[DEVLINK_EVENT], record) == FILTER_MATCH) + return 0; ++ ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c +index 43c023b..6044efa 100644 +--- a/ras-diskerror-handler.c ++++ b/ras-diskerror-handler.c +@@ -57,6 +57,7 @@ int ras_diskerror_event_handler(struct trace_seq *s, + struct diskerror_event ev; + uint32_t dev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+ * On previous kernels, the way to properly generate an event would +diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c +index 46c06cf..56acf1a 100644 +--- a/ras-extlog-handler.c ++++ b/ras-extlog-handler.c +@@ -208,6 +208,26 @@ static void report_extlog_mem_event(struct ras_events *ras, + struct trace_seq *s, + struct ras_extlog_event *ev) + { ++ const char *level; ++ ++ switch (ev->severity) { ++ case 0: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case 1: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case 2: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ case 3: ++ level = loglevel_str[LOGLEVEL_INFO]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s", + ev->error_seq, err_severity(ev->severity), + err_type(ev->etype), ev->address, +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index a729d93..e55c199 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -36,7 +36,7 @@ static int ras_mc_event_stat(time_t now, struct ras_mc_event *e) + } + + if (per_sec_ce_count > mc_ce_stat_threshold) +- log(ALL, LOG_ERR, " mc_event_stat: memory corrected error report %lld/sec\n", per_sec_ce_count); ++ log(ALL, LOG_ERR, " mc_event_stat: %s memory corrected error report %lld/sec\n", loglevel_str[LOGLEVEL_ALERT], per_sec_ce_count); + + return 0; + } +@@ -52,6 +52,46 @@ int ras_mc_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_mc_event ev; + int parsed_fields = 0; ++ const char *level; ++ ++ if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0) ++ goto parse_error; ++ parsed_fields++; ++ ++ switch (val) { ++ case HW_EVENT_ERR_CORRECTED: ++ ev.error_type = "Corrected"; ++ break; ++ case HW_EVENT_ERR_UNCORRECTED: ++ ev.error_type = "Uncorrected"; ++ break; ++ case HW_EVENT_ERR_DEFERRED: ++ ev.error_type = "Deferred"; ++ break; ++ case HW_EVENT_ERR_FATAL: ++ ev.error_type 
= "Fatal"; ++ break; ++ case HW_EVENT_ERR_INFO: ++ default: ++ ev.error_type = "Info"; ++ } ++ ++ switch (val) { ++ case HW_EVENT_ERR_UNCORRECTED: ++ case HW_EVENT_ERR_DEFERRED: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case HW_EVENT_ERR_FATAL: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case HW_EVENT_ERR_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -80,28 +120,6 @@ int ras_mc_event_handler(struct trace_seq *s, + ev.error_count = val; + trace_seq_printf(s, "%d ", ev.error_count); + +- if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0) +- goto parse_error; +- parsed_fields++; +- +- switch (val) { +- case HW_EVENT_ERR_CORRECTED: +- ev.error_type = "Corrected"; +- break; +- case HW_EVENT_ERR_UNCORRECTED: +- ev.error_type = "Uncorrected"; +- break; +- case HW_EVENT_ERR_DEFERRED: +- ev.error_type = "Deferred"; +- break; +- case HW_EVENT_ERR_FATAL: +- ev.error_type = "Fatal"; +- break; +- case HW_EVENT_ERR_INFO: +- default: +- ev.error_type = "Info"; +- } +- + trace_seq_puts(s, ev.error_type); + if (ev.error_count > 1) + trace_seq_puts(s, " errors:"); +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 92c5339..c272bb0 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -290,7 +290,16 @@ void report_mce_event(struct ras_events *ras, struct tep_record *record, + time_t now; + struct tm *tm; + struct mce_priv *mce = ras->mce_priv; ++ const char *level; + ++ if (e->status & MCI_STATUS_UC) ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ else if (e->status & MCI_STATUS_DEFERRED) ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ else ++ level = loglevel_str[LOGLEVEL_ERR]; ++ ++ trace_seq_printf(s, "%s ", level); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+ * On previous kernels, the way to properly generate an event would +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 0f4e937..43e7c5d 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -119,6 +119,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_mf_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index aabe8f9..2758d14 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -58,12 +58,12 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + if (data->eventType == nvmlEventTypeXidCriticalError) { + trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", + "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); +- trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); + trace_seq_printf(&s, "xid: %lld ", data->eventData); + } else { + trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", + "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); +- trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); + trace_seq_printf(&s, "event_type: %s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType); + trace_seq_printf(&s, "data: %lld ", data->eventData); + } +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 246cd12..237495c 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -17,6 +17,7 @@ + #include "ras-page-isolation.h" + #include "ras-poison-page-stat.h" + #include "ras-record.h" ++#include "types.h" + + #define PARSED_ENV_LEN 50 + #define ROW_ID_MAX_LEN 200 +@@ -349,8 +350,8 @@ static void page_offline(struct page_record *pr) + + pr->offlined = ret < 0 ? 
PAGE_OFFLINE_FAILED : PAGE_OFFLINE; + +- log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", +- addr, page_state[pr->offlined]); ++ log(TERM, LOG_INFO, "%s Result of offlining page at %#llx: %s\n", ++ loglevel_str[LOGLEVEL_ALERT], addr, page_state[pr->offlined]); + + #ifdef HAVE_POISON_PAGE_STAT + ras_poison_page_stat(); +diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c +index 2ce1d2a..c8d8859 100644 +--- a/ras-poison-page-stat.c ++++ b/ras-poison-page-stat.c +@@ -34,8 +34,8 @@ int ras_poison_page_stat(void) + fclose(fp); + + if (corrupted_kb > poison_stat_threshold) +- log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n", +- corrupted_kb, poison_stat_threshold); ++ log(ALL, LOG_WARNING, "%s Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n", ++ loglevel_str[LOGLEVEL_ALERT], corrupted_kb, poison_stat_threshold); + + return 0; + } +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index fb0bfd3..c497bf0 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -61,8 +61,8 @@ static char *signal_res[] = { + static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev) + { + trace_seq_printf(s, +- "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", +- ev->timestamp, strsignal(ev->sig), ev->error_no, ++ "%s %s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", ++ loglevel_str[LOGLEVEL_ALERT], ev->timestamp, strsignal(ev->sig), ev->error_no, + (ev->code < 0 || ev->code > BUS_MCEERR_AO) ? 
"Unknown" : errcode_str[ev->code], + ev->comm, ev->pid, + ev->group, +diff --git a/types.c b/types.c +new file mode 100644 +index 0000000..d4270ac +--- /dev/null ++++ b/types.c +@@ -0,0 +1,18 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include "types.h" ++ ++const char *loglevel_str[] = { ++ [LOGLEVEL_EMERG] = "[EMERG]", ++ [LOGLEVEL_ALERT] = "[ALERT]", ++ [LOGLEVEL_CRIT] = "[CRIT]", ++ [LOGLEVEL_ERR] = "[ERROR]", ++ [LOGLEVEL_WARNING] = "[WARNING]", ++ [LOGLEVEL_NOTICE] = "[NOTICE]", ++ [LOGLEVEL_INFO] = "[INFO]", ++ [LOGLEVEL_DEBUG] = "[DEBUG]", ++}; +\ No newline at end of file +diff --git a/types.h b/types.h +index 58cac1f..8563919 100644 +--- a/types.h ++++ b/types.h +@@ -189,4 +189,15 @@ static inline size_t strscat(char *dst, const char *src, size_t dsize) + "pointer type mismatch in container_of()"); \ + ((type *)(__mptr - offsetof(type, member))); }) + ++#define LOGLEVEL_DEFAULT -1 /* default (or last) loglevel */ ++#define LOGLEVEL_EMERG 0 /* system is unusable */ ++#define LOGLEVEL_ALERT 1 /* action must be taken immediately */ ++#define LOGLEVEL_CRIT 2 /* critical conditions */ ++#define LOGLEVEL_ERR 3 /* error conditions */ ++#define LOGLEVEL_WARNING 4 /* warning conditions */ ++#define LOGLEVEL_NOTICE 5 /* normal but significant condition */ ++#define LOGLEVEL_INFO 6 /* informational */ ++#define LOGLEVEL_DEBUG 7 /* debug-level messages */ ++ ++extern const char *loglevel_str[]; + #endif +-- +2.43.5 + diff --git a/1014-anolis-syslog-add-rasdaemon.ext.patch b/1014-anolis-syslog-add-rasdaemon.ext.patch new file mode 100644 index 0000000000000000000000000000000000000000..9bbc6f3727b9361d9131ae4481c74ea01799bfb3 --- /dev/null +++ b/1014-anolis-syslog-add-rasdaemon.ext.patch @@ -0,0 +1,250 @@ +From b4e1a8c87a7c079c35db5190067808df4ae471a6 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 3 Apr 2025 15:16:09 +0800 +Subject: [PATCH 14/30] anolis: syslog: add rasdaemon.ext + +Filter 
aer/pcihp/cmcistorm event through syslog-ng/rsyslog + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 24 ++++++++++- + misc/rasdaemon.rsyslog-ext.in | 26 ++++++++++++ + misc/rasdaemon.spec.in | 10 +++++ + misc/rasdaemon.syslog-ng-ext.in | 71 +++++++++++++++++++++++++++++++++ + 5 files changed, 131 insertions(+), 2 deletions(-) + create mode 100644 misc/rasdaemon.rsyslog-ext.in + create mode 100644 misc/rasdaemon.syslog-ng-ext.in + +diff --git a/Makefile.am b/Makefile.am +index 564a20d..ab26412 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -11,17 +11,25 @@ LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in + LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) + RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in + RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog) ++SYSLOG_EXT_SERVICES_IN = misc/rasdaemon.syslog-ng-ext.in ++SYSLOG_EXT_SERVICES = $(SYSLOG_EXT_SERVICES_IN:.syslog-ng-ext.in=.syslog-ng-ext) ++RSYSLOG_EXT_SERVICES_IN = misc/rasdaemon.rsyslog-ext.in ++RSYSLOG_EXT_SERVICES = $(RSYSLOG_EXT_SERVICES_IN:.rsyslog-ext.in=.rsyslog-ext) + EXTRA_DIST = \ + $(SYSTEMD_SERVICES_IN) \ + $(SYSLOG_SERVICES_IN) \ + $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ ++ $(SYSLOG_EXT_SERVICES_IN) \ ++ $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/nvml.py \ + contrib/*_trigger + + CLEANFILES= \ + ras-nvgpu-nvml.h \ ++ misc/rasdaemon.syslog-ng-ext \ ++ misc/rasdaemon.rsyslog-ext \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ +@@ -33,7 +41,7 @@ DISTCLEANFILES = misc/rasdaemon.spec + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog 
.rsyslog-ext.in .rsyslog-ext .syslog-ng-ext.in .syslog-ng-ext + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + +@@ -46,9 +54,15 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n + .rsyslog.in.rsyslog: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ + ++.syslog-ng-ext.in.syslog-ng-ext: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ ++.rsyslog-ext.in.rsyslog-ext: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) $(SYSLOG_EXT_SERVICES) $(RSYSLOG_EXT_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -179,4 +193,10 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi ++ if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng-ext "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.syslog-ng-ext"; \ ++ fi ++ if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.rsyslog-ext"; \ ++ fi + $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" +diff --git a/misc/rasdaemon.rsyslog-ext.in b/misc/rasdaemon.rsyslog-ext.in +new file mode 100644 +index 0000000..63cffc2 +--- /dev/null ++++ b/misc/rasdaemon.rsyslog-ext.in +@@ -0,0 +1,26 @@ ++# SPDX-License-Identifier: GPL-2.0 
++ ++template(name="rasdaemon_temp" type="string" string="%timegenerated% %hostname% rasdaemon: %$!event%: %$!level% %msg%\n") ++ ++if ($syslogfacility-text == "kern" and $msg contains "CMCI storm") then { ++ set $!event = "cmci_storm"; ++ ++ if $msg contains "detected" then set $!level = "[ALERT]"; ++ if $msg contains "subsided" then set $!level = "[ERROR]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} ++ ++if ($syslogfacility-text == "kern" and $msg contains "AER: device recovery") then { ++ set $!event = "aer_recovery"; ++ ++ if $msg contains "failed" then set $!level = "[EMERG]"; ++ if $msg contains "successful" then set $!level = "[ALERT]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} ++ ++if ($syslogfacility-text == "kern" and $msg contains "pciehp: Slot") then { ++ set $!event = "pciehp"; ++ if $msg contains "Link Down" then set $!level = "[ALERT]"; ++ if $msg contains "Card not present" then set $!level = "[ALERT]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index a30045c..521f148 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -57,6 +57,8 @@ install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{na + install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng + install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate + install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog ++install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext ++install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -71,18 +73,24 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) 
/usr/share/%{name}/%{name}.syslog-ng + %config(noreplace) /usr/share/%{name}/%{name}.logrotate + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog ++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext ++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext + + %post + if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; ++ ln -s /usr/share/%{name}/%{name}.syslog-ng-ext %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; + fi + if systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; ++ ln -s /usr/share/%{name}/%{name}.rsyslog-ext %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; + fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then +@@ -103,11 +111,13 @@ systemctl disable %{name}.service + if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog-ng service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; + fi + if systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; + fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then +diff --git 
a/misc/rasdaemon.syslog-ng-ext.in b/misc/rasdaemon.syslog-ng-ext.in +new file mode 100644 +index 0000000..ad001d2 +--- /dev/null ++++ b/misc/rasdaemon.syslog-ng-ext.in +@@ -0,0 +1,71 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++destination d_ras { ++ file("/var/log/rasdaemon" ++ template("${DATE} ${HOST} rasdaemon: ${RASDAEMON_EVENT}: ${RASDAEMON_LEVEL} ${MESSAGE}\n") ++ persist-name(ras-ext)); ++}; ++ ++filter f_aer { ++ facility(kern) and ++ match("AER: device recovery" value("MESSAGE")); ++}; ++ ++rewrite r_aer { ++ set("aer_recovery", value("RASDAEMON_EVENT")); ++ set("[EMERG]", value("RASDAEMON_LEVEL") ++ condition(match("failed" value("MESSAGE"))) ++ ); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("successful" value("MESSAGE"))) ++ ); ++}; ++ ++filter f_cmcistorm { ++ facility(kern) and ++ match("CMCI storm" value("MESSAGE")); ++}; ++ ++rewrite r_cmcistorm { ++ set("cmci_storm", value("RASDAEMON_EVENT")); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("detected" value("MESSAGE"))) ++ ); ++ set("[ERROR]", value("RASDAEMON_LEVEL") ++ condition(match("subsided" value("MESSAGE"))) ++ ); ++}; ++ ++filter f_pciehp { ++ facility(kern) and ++ match("pciehp: Slot" value("MESSAGE")); ++}; ++ ++rewrite r_pciehp { ++ set("pciehp", value("RASDAEMON_EVENT")); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("Link Down" value("MESSAGE"))) ++ ); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("Card not present" value("MESSAGE"))) ++ ); ++}; ++ ++log { ++ source(s_sys); ++ junction { ++ channel { ++ filter(f_cmcistorm); ++ rewrite(r_cmcistorm); ++ }; ++ channel { ++ filter(f_pciehp); ++ rewrite(r_pciehp); ++ }; ++ channel { ++ filter(f_aer); ++ rewrite(r_aer); ++ }; ++ }; ++ destination(d_ras); ++}; +\ No newline at end of file +-- +2.43.5 + diff --git a/1015-rasdaemon-add-page-offline-trigger.patch b/1015-rasdaemon-add-page-offline-trigger.patch new file mode 100644 index 
0000000000000000000000000000000000000000..20480a043a15f5ababeca2ba1492d6c2d5fc88f1 --- /dev/null +++ b/1015-rasdaemon-add-page-offline-trigger.patch @@ -0,0 +1,238 @@ +From e9995846c39321300a9c89936086222fab3cbb1c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 13 Dec 2024 14:38:02 +0800 +Subject: [PATCH 15/30] rasdaemon: add page offline trigger + +page offline include pre trigger and post trigger. + +Signed-off-by: Ruidong Tian +--- + contrib/page_offline_post_trigger | 25 ++++++++++++++++++ + contrib/page_offline_pre_trigger | 25 ++++++++++++++++++ + misc/rasdaemon.env | 5 ++++ + ras-page-isolation.c | 4 +++ + trigger.c | 43 +++++++++++++++++++++++++++++++ + trigger.h | 6 +++++ + 6 files changed, 108 insertions(+) + create mode 100755 contrib/page_offline_post_trigger + create mode 100755 contrib/page_offline_pre_trigger + +diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger +new file mode 100755 +index 0000000..4d3329c +--- /dev/null ++++ b/contrib/page_offline_post_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be set as POST_PAGE_OFFLINE_TRIGGER so that rasdaemon, ++# in daemon mode, runs it after a page offline attempt; the environment ++# variables below describe that page. ++ ++# environment: ++# PATH Command search path (rasdaemon's PATH, or a built-in default) ++# ADDR Address of the page, in hexadecimal ++# OTYPE Offline type, as an integer ++# ++ ++[ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local ++ ++if [ -d page_offline_post_trigger.extern ] ++then ++ ls page_offline_post_trigger.extern | ++ while read item ++ do ++ [ -x ./page_offline_post_trigger.extern/$item ] && .
./page_offline_post_trigger.extern/$item $1 ++ done ++fi ++ ++ ++exit 0 +diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger +new file mode 100755 +index 0000000..e464382 +--- /dev/null ++++ b/contrib/page_offline_pre_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be set as PRE_PAGE_OFFLINE_TRIGGER so that rasdaemon, ++# in daemon mode, runs it before a page offline attempt; the environment ++# variables below describe that page. ++ ++# environment: ++# PATH Command search path (rasdaemon's PATH, or a built-in default) ++# ADDR Address of the page, in hexadecimal ++# OTYPE Offline type, as an integer ++# ++ ++[ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local ++ ++if [ -d page_offline_pre_trigger.extern ] ++then ++ ls page_offline_pre_trigger.extern | ++ while read item ++ do ++ [ -x ./page_offline_pre_trigger.extern/$item ] && . ./page_offline_pre_trigger.extern/$item $1 ++ done ++fi ++ ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 1f5da55..f3f17c2 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -108,6 +108,11 @@ AER_CE_TRIGGER_TIMEOUT=0 + AER_UE_TRIGGER_TIMEOUT=0 + AER_FATAL_TRIGGER_TIMEOUT=0 + ++PRE_PAGE_OFFLINE_TRIGGER= ++POST_PAGE_OFFLINE_TRIGGER= ++PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 ++POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second.
+diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 237495c..569293f 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -18,6 +18,7 @@ + #include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "types.h" ++#include "trigger.h" + + #define PARSED_ENV_LEN 50 + #define ROW_ID_MAX_LEN 200 +@@ -296,6 +297,7 @@ void ras_page_account_init(void) + { + page_offline_init(); + page_isolation_init(); ++ setup_event_trigger("page_offline"); + } + + static int do_page_offline(unsigned long long addr, enum otype type) +@@ -303,6 +305,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) + int fd, rc; + char buf[20]; + ++ run_page_offline_trigger(addr, type, PRE); + fd = open(kernel_offline[type], O_WRONLY); + if (fd == -1) { + log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, +@@ -318,6 +321,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) + buf, kernel_offline[type], errno); + + close(fd); ++ run_page_offline_trigger(addr, type, POST); + return rc; + } + +diff --git a/trigger.c b/trigger.c +index a13fffd..7387113 100644 +--- a/trigger.c ++++ b/trigger.c +@@ -11,6 +11,7 @@ + #include "types.h" + #include "trigger.h" + ++#include "ras-events.h" + #include "ras-mce-handler.h" + + #define MAX_ENV 30 +@@ -95,6 +96,9 @@ struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"}; + struct event_trigger aer_ue_trigger = {"aer_event", "AER_UE_TRIGGER"}; + struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"}; + ++struct event_trigger pre_page_offline_trigger = {"page_offline", "PRE_PAGE_OFFLINE_TRIGGER"}; ++struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFFLINE_TRIGGER"}; ++ + static struct event_trigger *event_triggers[] = { + &mc_ue_trigger, + #ifdef HAVE_MCE +@@ -109,6 +113,10 @@ static struct event_trigger *event_triggers[] = { + &aer_ue_trigger, + &aer_fatal_trigger, + #endif ++#ifdef HAVE_MEMORY_CE_PFA ++ 
&pre_page_offline_trigger, ++ &post_page_offline_trigger, ++#endif + }; + + void setup_event_trigger(const char *event) +@@ -358,6 +366,32 @@ static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *tr + goto free; + if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0) + goto free; ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (i = 0; i < ei; i++) ++ free(env[i]); ++} ++ ++static void __run_page_offline_trigger(unsigned long long addr, int otype, ++ struct event_trigger *trigger) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ int i; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "ADDR=%#llx", addr) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "OTYPE=%d", otype) < 0) ++ goto free; + + env[ei] = NULL; + assert(ei < MAX_ENV); +@@ -378,3 +412,12 @@ void run_aer_event_trigger(struct ras_aer_event *e) + else if (!strcmp(e->error_type, "Uncorrected (Fatal)")) + __run_aer_trigger(e, &aer_fatal_trigger); + } ++ ++void run_page_offline_trigger(unsigned long long addr, int otype, int type) ++{ ++ if (type == POST) ++ __run_page_offline_trigger(addr, otype, &post_page_offline_trigger); ++ else ++ __run_page_offline_trigger(addr, otype, &pre_page_offline_trigger); ++} ++ +diff --git a/trigger.h b/trigger.h +index 31eff96..74df3d3 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -5,6 +5,11 @@ + + #include "ras-record.h" + ++enum page_offline_trigger_type { ++ PRE, ++ POST, ++}; ++ + struct event_trigger { + const char *event_name; + const char *env; +@@ -21,5 +26,6 @@ void run_mc_event_trigger(struct ras_mc_event *e); + void run_mce_record_trigger(struct mce_event *e); + void run_mf_event_trigger(struct ras_mf_event *e); + void run_aer_event_trigger(struct ras_aer_event *e); ++void run_page_offline_trigger(unsigned long long addr, int otype, int type); + 
+ #endif +-- +2.43.5 + diff --git a/1016-anolis-compta-rasdaemon-notices.patch b/1016-anolis-compta-rasdaemon-notices.patch new file mode 100644 index 0000000000000000000000000000000000000000..e13915c789c78aa257f0159edd013fc0a0ad9070 --- /dev/null +++ b/1016-anolis-compta-rasdaemon-notices.patch @@ -0,0 +1,129 @@ +From c1182ad260e0161817d0a4bbea31bcfe5fe7dbd3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 13 Dec 2024 14:38:02 +0800 +Subject: [PATCH 16/30] anolis: compta rasdaemon notices + +page offline include pre trigger and post trigger. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/page_offline_post_trigger | 2 ++ + contrib/page_offline_pre_trigger | 2 ++ + misc/notices/page-ce-offline-post-notice | 16 ++++++++++++++++ + misc/notices/page-ce-offline-pre-notice | 18 ++++++++++++++++++ + misc/rasdaemon.spec.in | 3 +++ + 6 files changed, 42 insertions(+) + create mode 100644 misc/notices/page-ce-offline-post-notice + create mode 100644 misc/notices/page-ce-offline-pre-notice + +diff --git a/Makefile.am b/Makefile.am +index ab26412..61f9a84 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -23,6 +23,7 @@ EXTRA_DIST = \ + $(SYSLOG_EXT_SERVICES_IN) \ + $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ ++ misc/notices \ + contrib/nvml.py \ + contrib/*_trigger + +diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger +index 4d3329c..ad7d44c 100755 +--- a/contrib/page_offline_post_trigger ++++ b/contrib/page_offline_post_trigger +@@ -12,6 +12,8 @@ + + [ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local + ++[ -x /etc/rasdaemon_notices/page-ce-offline-post-notice ] && . 
/etc/rasdaemon_notices/page-ce-offline-post-notice $(printf "%lu" "$ADDR") ++ + if [ -d page_offline_post_trigger.extern ] + then + ls page_offline_post_trigger.extern | +diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger +index e464382..6d8d3f2 100755 +--- a/contrib/page_offline_pre_trigger ++++ b/contrib/page_offline_pre_trigger +@@ -12,6 +12,8 @@ + + [ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local + ++[ -x /etc/rasdaemon_notices/page-ce-offline-pre-notice ] && . /etc/rasdaemon_notices/page-ce-offline-pre-notice $(printf "%lu" "$ADDR") ++ + if [ -d page_offline_pre_trigger.extern ] + then + ls page_offline_pre_trigger.extern | +diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice +new file mode 100644 +index 0000000..01966af +--- /dev/null ++++ b/misc/notices/page-ce-offline-post-notice +@@ -0,0 +1,16 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon after a page goes offline. ++ ++cd /etc/rasdaemon_notices/ ++ ++[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 ++ ++if [ -d page-ce-offline-post-notice.extern ] ++then ++ ls page-ce-offline-post-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 ++ done ++fi +\ No newline at end of file +diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice +new file mode 100644 +index 0000000..187556c +--- /dev/null ++++ b/misc/notices/page-ce-offline-pre-notice +@@ -0,0 +1,18 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon before a page goes offline. ++ ++cd /etc/rasdaemon_notices/ ++ ++[ -x ./page-ce-offline-pre-notice.local ] && . 
./page-ce-offline-pre-notice.local $1 ++ ++if [ -d page-ce-offline-pre-notice.extern ] ++then ++ ls page-ce-offline-pre-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 521f148..23be188 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -59,6 +59,8 @@ install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{na + install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog + install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext + install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext ++install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -75,6 +77,7 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog + %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext ++%{_sysconfdir}/rasdaemon_notices/* + + %post + if systemctl is-active --quiet syslog-ng.service; then +-- +2.43.5 + diff --git a/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch new file mode 100644 index 0000000000000000000000000000000000000000..637a05b8de3e0623722d23c6d5129e05b3e06d05 --- /dev/null +++ b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch @@ -0,0 +1,631 @@ +From 637a69ee5de5376eb185ea390cd07d8b9e5d4747 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 9 Dec 2024 16:28:54 +0800 +Subject: [PATCH 17/30] anolis: rasdaemon: add rasdaemon json exporter + +Signed-off-by: Ruidong Tian +--- + Makefile.am 
| 3 + + configure.ac | 16 +++ + misc/rasdaemon.env | 2 + + ras-aer-handler.c | 9 +- + ras-arm-handler.c | 6 +- + ras-mc-handler.c | 11 +- + ras-mce-handler.c | 7 +- + ras-mce-handler.h | 1 + + ras-memory-failure-handler.c | 6 +- + ras-record.h | 9 ++ + ras-report-json.c | 238 +++++++++++++++++++++++++++++++++++ + ras-report.h | 14 +++ + ras-signal-handler.c | 2 +- + rasdaemon.c | 8 ++ + 14 files changed, 326 insertions(+), 6 deletions(-) + create mode 100644 ras-report-json.c + +diff --git a/Makefile.am b/Makefile.am +index 61f9a84..1f21137 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -104,6 +104,9 @@ endif + if WITH_ABRT_REPORT + rasdaemon_SOURCES += ras-report.c + endif ++if WITH_JSON_REPORT ++ rasdaemon_SOURCES += ras-report-json.c ++endif + if WITH_HISI_NS_DECODE + rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c + endif +diff --git a/configure.ac b/configure.ac +index 43d845d..c5164ec 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -170,6 +170,21 @@ AS_IF([test "x$enable_abrt_report" = "xyes" || test "x$enable_all" = "xyes"], [ + AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_ABRT_REPORT], [USE_ABRT_REPORT="yes"], [USE_ABRT_REPORT="no"]) + ++AC_ARG_ENABLE([json_report], ++ AS_HELP_STRING([--enable-json-report], [enable storing data at SQL lite database (currently experimental)])) ++ ++AS_IF([test "x$enable_json_report" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_CHECK_LIB(pci, pci_lookup_name,[echo "found pci"] , AC_MSG_ERROR([*** Unable to find pci library]), ) ++ PCI_LIBS="-lpci" ++ AC_DEFINE(HAVE_JSON_REPORT,1,"have libpci") ++ AC_SUBST([WITH_JSON_REPORT]) ++]) ++ ++AM_CONDITIONAL([WITH_JSON_REPORT], [test x$enable_json_report = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_JSON_REPORT], [USE_JSON_REPORT="yes"], [USE_JSON_REPORT="no"]) ++ ++AC_SUBST([PCI_LIBS]) ++ + AC_ARG_ENABLE([hisi_ns_decode], + AS_HELP_STRING([--enable-hisi-ns-decode], 
[enable HISI_NS_DECODE events (currently experimental)])) + +@@ -337,4 +352,5 @@ compile time options summary + Signal : $USE_SIGNAL + ERST : $USE_ERST + NVGPU RAS errors : $USE_NVGPU ++ Json exporter : $USE_JSON_REPORT + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index f3f17c2..085d839 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -73,6 +73,8 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive isolation from causing an avalanche effect + CPU_ISOLATION_LIMIT="10" + ++DISABLE="json_report" ++ + # Event Trigger + + # Event trigger will be executed when the specified event occurs. +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index c67f267..023dd4d 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -115,7 +115,7 @@ int ras_aer_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_aer_event ev; ++ struct ras_aer_event ev = { 0 }; + char buf[BUF_LEN] = { 0 }; + uint16_t vendor_id = 0, device_id = 0; + #ifdef HAVE_AMP_NS_DECODE +@@ -207,24 +207,28 @@ int ras_aer_event_handler(struct trace_seq *s, + #ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; + #endif ++ ev.severity = GHES_SEV_RECOVERABLE; + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; + #ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; + #endif ++ ev.severity = GHES_SEV_PANIC; + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; + #ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; + #endif ++ ev.severity = GHES_SEV_CORRECTED; + break; + default: + ev.error_type = "Unknown severity"; + #ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; + #endif ++ ev.severity = GHES_SEV_NO; + } + trace_seq_puts(s, ev.error_type); + +@@ -271,6 +275,9 @@ int ras_aer_event_handler(struct trace_seq *s, + return -1; + #endif + ++#ifdef HAVE_JSON_REPORT ++ report_aer_event_json(s, &ev); ++#endif + run_aer_event_trigger(&ev); + + return 0; +diff --git a/ras-arm-handler.c 
b/ras-arm-handler.c +index 226feb3..431dd9b 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -484,7 +484,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_arm_event ev; ++ struct ras_arm_event ev = { 0 }; + int len = 0; + + memset(&ev, 0, sizeof(ev)); +@@ -606,5 +606,9 @@ int ras_arm_event_handler(struct trace_seq *s, + ras_report_arm_event(ras, &ev); + #endif + ++#ifdef HAVE_JSON_REPORT ++ report_arm_event_json(s, &ev); ++#endif ++ + return 0; + } +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index e55c199..2ffaf2e 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -17,6 +17,7 @@ + #include "ras-mc-handler.h" + #include "ras-page-isolation.h" + #include "ras-report.h" ++#include "ras-events.h" + #include "trigger.h" + #include "types.h" + +@@ -50,7 +51,7 @@ int ras_mc_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_mc_event ev; ++ struct ras_mc_event ev = { 0 }; + int parsed_fields = 0; + const char *level; + +@@ -61,19 +62,23 @@ int ras_mc_event_handler(struct trace_seq *s, + switch (val) { + case HW_EVENT_ERR_CORRECTED: + ev.error_type = "Corrected"; ++ ev.severity = GHES_SEV_CORRECTED; + break; + case HW_EVENT_ERR_UNCORRECTED: + ev.error_type = "Uncorrected"; ++ ev.severity = GHES_SEV_RECOVERABLE; + break; + case HW_EVENT_ERR_DEFERRED: + ev.error_type = "Deferred"; + break; + case HW_EVENT_ERR_FATAL: + ev.error_type = "Fatal"; ++ ev.severity = GHES_SEV_PANIC; + break; + case HW_EVENT_ERR_INFO: + default: + ev.error_type = "Info"; ++ ev.severity = GHES_SEV_NO; + } + + switch (val) { +@@ -249,6 +254,10 @@ int ras_mc_event_handler(struct trace_seq *s, + + run_mc_event_trigger(&ev); + ++#ifdef HAVE_JSON_REPORT ++ report_mc_event_json(s, &ev); ++#endif ++ + return 0; + + parse_error: +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index c272bb0..b61976a 100644 +--- a/ras-mce-handler.c 
++++ b/ras-mce-handler.c +@@ -18,6 +18,7 @@ + #include "ras-report.h" + #include "types.h" + #include "trigger.h" ++#include "ras-events.h" + + /* + * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code, +@@ -507,7 +508,7 @@ int ras_mce_event_handler(struct trace_seq *s, + unsigned long long val; + struct ras_events *ras = context; + struct mce_priv *mce = ras->mce_priv; +- struct mce_event e; ++ struct mce_event e = { 0 }; + int rc = 0; + + memset(&e, 0, sizeof(e)); +@@ -608,6 +609,10 @@ int ras_mce_event_handler(struct trace_seq *s, + ras_report_mce_event(ras, &e); + #endif + ++#ifdef HAVE_JSON_REPORT ++ report_mce_event_json(s, &e); ++#endif ++ + run_mce_record_trigger(&e); + + return 0; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index f120874..d2031cf 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -68,6 +68,7 @@ struct mce_event { + int32_t vdata_len; + const uint64_t *vdata; + ++ int severity; + /* Parsed data */ + char frutext[17]; + char timestamp[64]; +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 43e7c5d..df90244 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -117,7 +117,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_mf_event ev; ++ struct ras_mf_event ev = { 0 }; + + trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]); + /* +@@ -172,5 +172,9 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + #endif + run_mf_event_trigger(&ev); + ++#ifdef HAVE_JSON_REPORT ++ report_mf_event_json(s, &ev); ++#endif ++ + return 0; + } +diff --git a/ras-record.h b/ras-record.h +index ce7d12c..7f49b74 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -16,6 +16,13 @@ + #include "config.h" + #include "types.h" + ++static const char * const severity_strs[] = { ++ "info", ++ "corrected", ++ "recoverable", ++ "fatal", ++}; ++ + extern long user_hz; + + struct 
ras_events;
+@@ -23,6 +30,7 @@ struct ras_events;
+ struct ras_mc_event {
+ 	char timestamp[64];
+ 	int error_count;
++	int severity;
+ 	const char *error_type, *msg, *label;
+ 	unsigned char mc_index;
+ 	signed char top_layer, middle_layer, lower_layer;
+@@ -44,6 +52,7 @@ struct ras_aer_event {
+ 	char timestamp[64];
+ 	const char *error_type;
+ 	char *dev_name;
++	int severity;
+ 	uint8_t tlp_header_valid;
+ 	uint32_t *tlp_header;
+ 	const char *msg;
+diff --git a/ras-report-json.c b/ras-report-json.c
+new file mode 100644
+index 0000000..b1c33a4
+--- /dev/null
++++ b/ras-report-json.c
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 and
++ * only version 2 as published by the Free Software Foundation.
++
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "traceevent/event-parse.h"
++#include "ras-report.h"
++
++#define NONE ""
++int json_report = 1;
++/* Append a JSON rendering of a memory-controller event to @s; does nothing when json_report is 0. */
++void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,
++		"\n{ \"%s\": \"mc_event\", " /* name is literal, as in the sibling reporters: one %s per argument */
++		"\"timestamp\": \"%s\", "
++		"\"severity\": \"%s\", "
++		"\"error_count\": %d, "
++		"\"error_type\": \"%s\", "
++		"\"msg\": \"%s\", "
++		"\"label\": \"%s\", "
++		"\"location\": \"%d:%d:%d:%d\", "
++		"\"address\": \"%#llx\", "
++		"\"grain\": \"%#llx\", "
++		"\"syndrome\": \"%#llx\", "
++		"\"driver_detail\": \"%s\" }",
++		JSON_REPORT_KEY,
++		(*ev->timestamp) ? ev->timestamp : NONE,
++		severity_strs[ev->severity],
++		ev->error_count,
++		(ev->error_type) ? ev->error_type : NONE,
++		(ev->msg) ? ev->msg : NONE,
++		(ev->label) ? ev->label : NONE,
++		ev->mc_index, ev->top_layer, ev->middle_layer, ev->lower_layer,
++		ev->address,
++		ev->grain,
++		ev->syndrome,
++		(ev->driver_detail) ? ev->driver_detail : NONE);
++}
++/* Find the PCI device named by @bdf through libpci; on a match fill @pci_name, @vendor_id and @device_id. */
++static void get_pci_dev_name(const char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id)
++{
++	struct pci_access *pacc;
++	struct pci_dev *dev;
++	struct pci_filter filter = {0};
++	int domain = 0, bus, device, function = 0; /* defaults for the short address forms */
++
++	pacc = pci_alloc();
++	if (!pacc)
++		return;
++	pci_init(pacc);
++	pci_scan_bus(pacc);
++
++	if (!bdf || !pci_name) /* sscanf() below must not see a NULL string */
++		goto free;
++
++	if (sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &device, &function) != 4) { /* longest form first */
++		if (sscanf(bdf, "%x:%x.%x", &bus, &device, &function) == 3)
++			domain = 0; /* the failed 4-field attempt may have written domain */
++		else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &device) != 3)
++			goto free;
++	}
++	pci_filter_init(pacc, &filter);
++	filter.bus = bus;
++	filter.slot = device;
++	filter.func = function;
++	filter.domain = domain;
++
++	for (dev = pacc->devices; dev; dev = dev->next) {
++		if (pci_filter_match(&filter, dev)) {
++			pci_fill_info(dev, PCI_FILL_IDENT);
++			*vendor_id = dev->vendor_id;
++			*device_id = dev->device_id;
++			pci_lookup_name(pacc, pci_name, len,
++					PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE,
++					dev->vendor_id, dev->device_id);
++			break;
++		}
++	}
++
++free:
++	pci_cleanup(pacc);
++}
++/* Append a JSON rendering of a PCIe AER event, including the device name resolved from ev->dev_name. */
++void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev)
++{
++	char pci_name[128] = { 0 }; /* stays "" when the lookup finds nothing */
++	u16 vendor = 0, device = 0;
++
++	if (!s || !ev || !json_report)
++		return;
++
++	get_pci_dev_name(ev->dev_name, pci_name, 128, &vendor, &device);
++
++	trace_seq_printf(s,
++		"\n{ \"%s\": \"aer_event\", " \
++		"\"timestamp\": \"%s\", " \
++		"\"severity\": \"%s\", " \
++		"\"error_type\": \"%s\", " \
++		"\"dev_name\": \"%s\", " \
++		"\"pci_dev_name\": \"%s\", " \
++		"\"vendor_id\": \"%#x\", " \
++		"\"device_id\": \"%#x\", " \
++		"\"msg\": \"%s\" }",
++		JSON_REPORT_KEY,
++		(*ev->timestamp) ? ev->timestamp : NONE,
++		severity_strs[ev->severity],
++		(ev->error_type) ? ev->error_type : NONE,
++		(ev->dev_name) ? ev->dev_name : NONE,
++		(*pci_name) ? pci_name : NONE,
++		vendor, device,
++		(ev->msg) ? ev->msg : NONE);
++}
++
++void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,
++		"\n{ \"%s\": \"arm_event\", " \
++		"\"timestamp\": \"%s\", " \
++		"\"error_count\": %d, " \
++		"\"affinity\": %d, " \
++		"\"mpidr\": \"%#lx\", " \
++		"\"midr\": \"%#lx\", " \
++		"\"running_state\": %d, " \
++		"\"psci_state\": %d }",
++		JSON_REPORT_KEY,
++		(*ev->timestamp) ? ev->timestamp : NONE,
++		ev->error_count,
++		ev->affinity,
++		ev->mpidr,
++		ev->midr,
++		ev->running_state,
++		ev->psci_state);
++}
++
++void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,
++		"\n{ \"%s\": \"mf_event\", \"timestamp\": \"%s\", "
++		"\"pfn\": \"%s\", \"page_type\": \"%s\", " /* pfn is printed from a string, so it must be quoted */
++		"\"action_result\": \"%s\" }",
++		JSON_REPORT_KEY,
++		(*ev->timestamp) ? ev->timestamp : NONE,
++		(*ev->pfn) ? ev->pfn : NONE,
++		(ev->page_type) ? ev->page_type : NONE,
++		(ev->action_result) ?
ev->action_result : NONE);
++}
++/* Append a JSON rendering of an MCE record to @s; severity is derived here from the MCi_STATUS bits. */
++void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	if (ev->status & MCI_STATUS_UC)
++		ev->severity = GHES_SEV_RECOVERABLE;
++	else if (ev->status & MCI_STATUS_DEFERRED)
++		ev->severity = GHES_SEV_RECOVERABLE;
++	else
++		ev->severity = GHES_SEV_CORRECTED;
++
++	trace_seq_printf(s,
++		"\n{ \"%s\": \"mce_record\", " /* NOTE(review): name was a dangling %s with no argument; value chosen to match run_mce_record_trigger - confirm */
++		"\"timestamp\": \"%s\", "
++		"\"severity\": \"%s\", "
++		"\"bank\": %d, "
++		"\"bank_name\": \"%s\", "
++		"\"status\": \"%#lx\", "
++		"\"error_msg\": \"%s\", "
++		"\"mcistatus_msg\": \"%s\", "
++		"\"mcastatus_msg\": \"%s\", "
++		"\"user_action\": \"%s\", "
++		"\"mc_location\": \"%s\", "
++		"\"cpuid\": \"%#x\", "
++		"\"cpu\": %d, "
++		"\"socketid\": %d, "
++		"\"ip\": \"%#lx\", "
++		"\"cs\": \"%#x\", "
++		"\"misc\": \"%#lx\", "
++		"\"addr\": \"%#lx\", "
++		"\"synd\": \"%#lx\", "
++		"\"ipid\": \"%#lx\", "
++		"\"mcgstatus_msg\": \"%s\", "
++		"\"mcgstatus\": \"%#lx\", "
++		"\"mcgcap\": \"%#lx\", "
++		"\"apicid\": \"%#x\" }",
++		JSON_REPORT_KEY,
++		(*ev->timestamp) ? ev->timestamp : NONE,
++		severity_strs[ev->severity],
++		ev->bank,
++		(*ev->bank_name) ? ev->bank_name : NONE,
++		ev->status,
++		(*ev->error_msg) ? ev->error_msg : NONE,
++		(*ev->mcistatus_msg) ? ev->mcistatus_msg : NONE,
++		(*ev->mcastatus_msg) ? ev->mcastatus_msg : NONE,
++		(*ev->user_action) ? ev->user_action : NONE,
++		(*ev->mc_location) ? ev->mc_location : NONE,
++		ev->cpuid,
++		ev->cpu,
++		ev->socketid,
++		ev->ip,
++		ev->cs,
++		ev->misc,
++		ev->addr,
++		ev->synd,
++		ev->ipid,
++		(*ev->mcgstatus_msg) ? ev->mcgstatus_msg : NONE,
++		ev->mcgstatus,
++		ev->mcgcap,
++		ev->apicid);
++}
++
+diff --git a/ras-report.h b/ras-report.h
+index f680a25..eeb25bb 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -23,6 +23,12 @@
+ /* ABRT socket file */
+ #define ABRT_SOCKET "/var/run/abrt/abrt.socket"
+ 
++#ifdef HAVE_JSON_REPORT
++#define JSON_REPORT_KEY "rasdaemon_event_name"
++
++extern int json_report;
++#endif
++
+ #ifdef HAVE_ABRT_REPORT
+ 
+ int ras_report_mc_event(struct ras_events *ras,
+@@ -115,4 +121,12 @@ static inline int ras_report_signal_event(struct ras_events *ras,
+ { return 0; };
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev);
++void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev);
++void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
++void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
++void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
++#endif
++
+ #endif
+diff --git a/ras-signal-handler.c b/ras-signal-handler.c
+index e8f7f1d..d15c4f6 100644
+--- a/ras-signal-handler.c
++++ b/ras-signal-handler.c
+@@ -78,7 +78,7 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_signal_event ev;
++	struct ras_signal_event ev = { 0 };
+ 
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+diff --git a/rasdaemon.c b/rasdaemon.c +index 9c5f9dd..d5d2f85 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -16,6 +16,7 @@ + #include "ras-logger.h" + #include "ras-poison-page-stat.h" + #include "ras-record.h" ++#include "ras-report.h" + #include "ras-mc-handler.h" + #include "ras-pcie-edpc.h" + #include "ras-nvgpu.h" +@@ -146,6 +147,13 @@ int main(int argc, char *argv[]) + log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold); + #endif + ++#ifdef HAVE_JSON_REPORT ++ if (choices_disable && ++ strlen(choices_disable) != 0 && ++ strstr(choices_disable, "json_report")) ++ json_report = 0; ++#endif ++ + #ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, +-- +2.43.5 + diff --git a/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch new file mode 100644 index 0000000000000000000000000000000000000000..19a3fd5ceedb8a3c7913d248426f048a69ed5370 --- /dev/null +++ b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch @@ -0,0 +1,998 @@ +From 340a8af496dd80a719e27e6395f96c8d75cf6f36 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 11 Dec 2024 16:16:30 +0800 +Subject: [PATCH 18/30] anolis: rasdaemon: kmsg_monitor: introduce kmsg_monitor + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 6 +- + configure.ac | 11 +++ + misc/rasdaemon.env | 43 +++++++++- + ras-events.c | 114 +++++++++++++++++++++++-- + ras-kmsg.c | 203 +++++++++++++++++++++++++++++++++++++++++++++ + ras-kmsg.h | 47 +++++++++++ + ras-report-json.c | 68 ++++++++++++++- + ras-report.h | 2 + + ras-time.c | 103 +++++++++++++++++++++++ + ras-time.h | 27 ++++++ + rasdaemon.c | 14 ++++ + trigger.c | 55 ++++++++++++ + trigger.h | 3 + + 13 files changed, 685 insertions(+), 11 deletions(-) + create mode 100644 ras-kmsg.c + create mode 100644 ras-kmsg.h + create mode 100644 ras-time.c + create mode 100644 ras-time.h + 
+diff --git a/Makefile.am b/Makefile.am +index 1f21137..68b354b 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -134,6 +134,9 @@ endif + if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c + endif ++if WITH_KMSG_MONITOR ++ rasdaemon_SOURCES += ras-kmsg.c ras-time.c ++endif + + if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c +@@ -159,7 +162,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h \ ++ ras-kmsg.h ras-time.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index c5164ec..dfb7f02 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -303,6 +303,16 @@ AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) + ++AC_ARG_ENABLE([kmsg_monitor], ++ AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)])) ++ ++AS_IF([test "x$enable_kmsg_monitor" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_KMSG_MONITOR,1,"have kmsg monitor") ++ AC_SUBST([WITH_KMSG_MONITOR]) ++]) ++AM_CONDITIONAL([WITH_KMSG_MONITOR], [test x$enable_kmsg_monitor = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_KMSG_MONITOR], [USE_KMSG_MONITOR="yes"], [USE_KMSG_MONITOR="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -353,4 +363,5 @@ compile time options 
summary + ERST : $USE_ERST + NVGPU RAS errors : $USE_NVGPU + Json exporter : $USE_JSON_REPORT ++ Kmsg monitor : $USE_KMSG_MONITOR + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 085d839..f498e24 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive isolation from causing an avalanche effect + CPU_ISOLATION_LIMIT="10" + +-DISABLE="json_report" ++DISABLE="json_report,kmsg_monitor" + + # Event Trigger + +@@ -115,6 +115,10 @@ POST_PAGE_OFFLINE_TRIGGER= + PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + ++#trigger for kmsg ++KMSG_TRIGGER= ++KMSG_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +@@ -145,3 +149,40 @@ EDPC_DEVICE= + # For example: + # NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock + NVGPU_DISABLE_EVENT="0x10" ++ ++ ++# KMSG MONITOR ++KMSG_IGNORE_XID="" ++KMSG_LIMIT=100 ++KMSG_TRACE_NUM=6 ++KMSG_TRACE_END=1 ++ ++KMSG_TRACER_NAME_0="xid" ++KMSG_TRACER_REGEX_0="NVRM: Xid \\(PCI:(.*)( GPU-I:[0-9]+)?( GPU-CI:[0-9]+)?\\): ([0-9]+), pid=([^,]*)(, name=([^,]*))?, (.*)" ++KMSG_TRACER_GROUP_COUNT_0=8 ++KMSG_TRACER_GROUP_KEY_0="pci_port,gpu-i,gpu-ci,xid,pid,has_name,name,msg" ++ ++KMSG_TRACER_NAME_1="sxid" ++KMSG_TRACER_REGEX_1="nvidia-nvswitch[0-9]+: SXid \\(PCI:(.*)\\): ([0-9]+), (.*)" ++KMSG_TRACER_GROUP_COUNT_1=3 ++KMSG_TRACER_GROUP_KEY_1="pci_port,xid,msg" ++ ++KMSG_TRACER_NAME_2="axid" ++KMSG_TRACER_REGEX_2="PPU.* Xid \\((.*)\\): ([0-9]+)(, pid=([^,]*))?, (.*)" ++KMSG_TRACER_GROUP_COUNT_2=5 ++KMSG_TRACER_GROUP_KEY_2="pci_port,xid,has_pid,pid,msg" ++ ++KMSG_TRACER_NAME_3="aer_recovery" ++KMSG_TRACER_REGEX_3="pcieport (.*): AER: device recovery (successful|failed)" ++KMSG_TRACER_GROUP_COUNT_3=2 ++KMSG_TRACER_GROUP_KEY_3="pci_port,res" ++ ++KMSG_TRACER_NAME_4="pcihp" ++KMSG_TRACER_REGEX_4="pcieport (.*): pciehp: Slot\\(([0-9]+)\\): (Link Up|Link Down|Card present|Card not present|Link Down/Up 
ignored \\(recovered by DPC\\))" ++KMSG_TRACER_GROUP_COUNT_4=3 ++KMSG_TRACER_GROUP_KEY_4="pci_port,slot,res" ++ ++KMSG_TRACER_NAME_5="cmci_storm" ++KMSG_TRACER_REGEX_5="CMCI storm (.*): switching to .* mode" ++KMSG_TRACER_GROUP_COUNT_5=1 ++KMSG_TRACER_GROUP_KEY_5="storm" +diff --git a/ras-events.c b/ras-events.c +index 06f9a37..d40f29e 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -14,6 +14,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -37,6 +39,25 @@ + #include "ras-signal-handler.h" + #include "ras-record.h" + #include "trigger.h" ++#include "ras-kmsg.h" ++ ++#ifdef HAVE_KMSG_MONITOR ++#define NS_PER_SEC 1000000000L ++ ++static struct timespec ts_sub(struct timespec a, struct timespec b) ++{ ++ struct timespec result = { ++ .tv_sec = a.tv_sec - b.tv_sec, ++ .tv_nsec = a.tv_nsec - b.tv_nsec ++ }; ++ ++ if (result.tv_nsec < 0) { ++ result.tv_sec -= 1; ++ result.tv_nsec += NS_PER_SEC; ++ } ++ return result; ++} ++#endif + + /* + * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never +@@ -464,12 +485,22 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + int ready, i, count_nready; + struct kbuffer *kbuf; + void *page; +- struct pollfd fds[n_cpus + 1]; + struct signalfd_siginfo fdsiginfo; + sigset_t mask; + int warnonce[n_cpus]; + char pipe_raw[PATH_MAX]; + int legacy_kernel = 0; ++#ifdef HAVE_KMSG_MONITOR ++ int fd_num = n_cpus + 2; ++ char kmsg_buf[PRINTK_MESSAGE_MAX]; ++ int limit = 0; ++ struct timespec limit_time = { 0 }; ++ int need_sleep = 0; ++#else ++ int fd_num = n_cpus + 1; ++#endif ++ struct pollfd fds[fd_num]; ++ + + memset(&warnonce, 0, sizeof(warnonce)); + +@@ -496,7 +527,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + if (set_buffer_percent(pdata[0].ras, 0)) + log(TERM, LOG_WARNING, "Set buffer_percent failed\n"); + +- for (i = 0; i < (n_cpus + 1); i++) ++ for (i = 0; i < fd_num; i++) + fds[i].fd = -1; + + for (i = 0; i < n_cpus; i++) { +@@ -527,6 +558,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + goto error; + } + ++#ifdef HAVE_KMSG_MONITOR ++ if (kmsg_monitor) { ++ fds[n_cpus + 1].events = POLLIN; ++ fds[n_cpus + 1].fd = open("/dev/kmsg", O_RDONLY); ++ if (fds[n_cpus + 1].fd < 0) { ++ log(TERM, LOG_ERR, "open /dev/kmsg\n"); ++ goto error; ++ } ++ ++ if (kmsg_trace_end) { ++ off_t offset = lseek(fds[n_cpus + 1].fd, 0, SEEK_END); ++ ++ if (offset == -1) { ++ log(TERM, LOG_ERR, "Can not seek kmsg end\n"); ++ goto error; ++ } ++ } ++ } ++#endif ++ + log(TERM, LOG_INFO, "Listening to events for cpus 0 to %d\n", n_cpus - 1); + if (pdata[0].ras->record_events) { + if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras)) +@@ -538,7 +589,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + } + + do { +- ready = poll(fds, (n_cpus + 1), -1); ++ ready = poll(fds, fd_num, -1); + if (ready < 0) + log(TERM, LOG_WARNING, "poll\n"); + +@@ -564,6 +615,40 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + } + + 
count_nready = 0;
++#ifdef HAVE_KMSG_MONITOR
++		/* read one record from the kmsg fd (slot n_cpus + 1) and run it through the tracers */
++		if (kmsg_monitor && (fds[n_cpus + 1].revents & POLLIN)) {
++			size = read(fds[n_cpus + 1].fd, kmsg_buf, PRINTK_MESSAGE_MAX - 1); /* keep one byte for the NUL below */
++			if (size < 0) {
++				log(TERM, LOG_WARNING, "read kmsg %s\n", strerror(errno));
++			} else if (size > 0) {
++				kmsg_buf[size] = '\0';
++				kmsg_match(kmsg_buf);
++				memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX);
++			} else {
++				count_nready++;
++			}
++			limit++;
++			if (kmsg_limit && limit >= kmsg_limit) { /* kmsg_limit records in under 0.5s => throttle */
++				struct timespec tv, res;
++
++				clock_gettime(CLOCK_MONOTONIC, &tv);
++
++				res = ts_sub(tv, limit_time);
++				if (res.tv_sec == 0 && res.tv_nsec >= 0 && res.tv_nsec < (0.5 * NS_PER_SEC)) {
++					need_sleep = 1;
++					log(TERM, LOG_WARNING, "kmsg limit %lx!\n", res.tv_nsec);
++				}
++
++				limit = 0;
++				limit_time = tv;
++			}
++
++		} else {
++			count_nready++;
++		}
++#endif
++
+ 		for (i = 0; i < n_cpus; i++) {
+ 			if (fds[i].revents & POLLERR) {
+ 				if (!warnonce[i]) {
+@@ -599,11 +684,18 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 				count_nready++;
+ 			}
+ 		}
++#ifdef HAVE_KMSG_MONITOR
++		if (need_sleep) {
++			usleep(500000);
++			need_sleep = 0;
++		}
++#endif
++
+ 		/*
+ 		 * If we enable fallback mode, it will always be used, as
+ 		 * poll is still not working fine, IMHO
+ 		 */
+-		if (count_nready == n_cpus) {
++		if (count_nready == fd_num - 1) { /* fds[n_cpus] is never counted, so fd_num itself is unreachable */
+ 			/* Should only happen with legacy kernels */
+ 			legacy_kernel = 1;
+ 			break;
+@@ -627,7 +719,7 @@ error:
+ 	free(page);
+ 	sigprocmask(SIG_UNBLOCK, &mask, NULL);
+ 
+-	for (i = 0; i < (n_cpus + 1); i++) {
++	for (i = 0; i < fd_num; i++) {
+ 		if (fds[i].fd > 0)
+ 			close(fds[i].fd);
+ 	}
+@@ -991,6 +1083,13 @@ int handle_ras_events(int record_events, int enable_ipmitool)
+ 	ras_page_account_init();
+ #endif
+ 
++#ifdef HAVE_KMSG_MONITOR
++	if (kmsg_monitor) {
++		if (kmsg_tracer_init())
++			goto err;
++	}
++#endif
++
+ 	rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
+ 			       ras_mc_event_handler, NULL, MC_EVENT);
+ 	if (!rc)
+@@ -1269,5 +1368,10 @@ err:
+ 
#ifdef HAVE_MEMORY_ROW_CE_PFA
+ 	row_record_infos_free();
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	if (kmsg_monitor)
++		kmsg_tracer_destroy();
++#endif
++
+ 	return rc;
+ }
+diff --git a/ras-kmsg.c b/ras-kmsg.c
+new file mode 100644
+index 0000000..2dd47d6
+--- /dev/null
++++ b/ras-kmsg.c
+@@ -0,0 +1,203 @@
++#define _GNU_SOURCE
++#include
++#include
++#include
++
++#include "ras-logger.h"
++#include "ras-report.h"
++#include "ras-kmsg.h"
++#include "trigger.h"
++
++int kmsg_monitor = 1;
++int kmsg_trace_end;
++int kmsg_limit;
++
++struct kmsg_tracer_info *kmsg_tracer;
++int kmsg_tracer_num;
++/* Report @msg via the first tracer whose regex matches it; returns 1 on a regexec() failure, else 0. */
++int kmsg_match(char *msg)
++{
++	int ret, group_count, i;
++	regex_t *regex;
++	regmatch_t *matches;
++	char tmpbuf[256];
++
++	for (i = 0; i < kmsg_tracer_num; i++) {
++		regex = &kmsg_tracer[i].regex_c;
++		matches = &kmsg_tracer[i].matches[0];
++		group_count = kmsg_tracer[i].group_count;
++
++		ret = regexec(regex, msg, group_count, matches, 0);
++		if (ret > REG_NOMATCH) {
++			regerror(ret, regex, tmpbuf, sizeof(tmpbuf));
++			log(ALL, LOG_ERR, "Regex execution error: %s\n", tmpbuf);
++			return 1;
++		} else if (ret == REG_NOMATCH) {
++			continue;
++		}
++
++#ifdef HAVE_JSON_REPORT
++		report_kmsg_event_json(&kmsg_tracer[i], msg);
++#endif
++		run_kmsg_trigger(&kmsg_tracer[i], msg);	/* the tracer that matched, not tracer 0 */
++
++		break;
++	}
++
++	return 0;
++}
++/* Release everything kmsg_tracer_init() allocated, including partially built tracers. */
++int kmsg_tracer_destroy(void)
++{
++	log(ALL, LOG_INFO, "kmsg tracer destroy\n");
++
++	if (!kmsg_tracer)
++		return 0;
++	for (int i = 0; i < kmsg_tracer_num; i++) {
++		struct kmsg_tracer_info *t = &kmsg_tracer[i];
++
++		/* matches is only allocated after regcomp() succeeded for this tracer */
++		if (t->matches)
++			regfree(&t->regex_c);
++		free(t->name);	/* free(NULL) is a no-op, so no per-field NULL tests */
++		free(t->regex);
++		free(t->matches);
++		if (!t->group_key)
++			continue;
++		for (int j = 0; j < t->group_count; j++)
++			free(t->group_key[j]);
++		free(t->group_key);
++	}
++	free(kmsg_tracer);
++	kmsg_tracer = NULL;
++	kmsg_tracer_num = 0;
++
++	return 0;
++}
++/* Build kmsg_tracer[] from the KMSG_* environment; returns 0 on success or when nothing is configured. */
++int kmsg_tracer_init(void)
++{
++	char *s;
++	int kmsg_tracer_group_count, ret, c = 0;
++	char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp;
++	char *kmsg_tracer_group_key, *token;
++
++	s = getenv(KMSG_TRACE_END);
++	if (!s)
++		kmsg_trace_end = 0;
++	else
++		kmsg_trace_end = atoi(s);
++
++	s = getenv(KMSG_TRACE_NUM);
++	if (!s)
++		return 0;
++
++	kmsg_tracer_num = atoi(s);
++	if (kmsg_tracer_num <= 0)
++		return 0;
++
++	s = getenv(KMSG_LIMIT);
++	if (s) {
++		kmsg_limit = atoi(s);
++		if (kmsg_limit < 0)
++			return -1;
++	}
++
++	kmsg_tracer = calloc(kmsg_tracer_num, sizeof(struct kmsg_tracer_info));
++	if (!kmsg_tracer)
++		return -1;
++
++	for (int i = 0; i < kmsg_tracer_num; i++) {
++		// trace name
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_NAME, i);
++		kmsg_tracer_name = getenv(buf);
++		if (!kmsg_tracer_name || ((strlen(kmsg_tracer_name) > NAME_LEN)))
++			return -1;
++		kmsg_tracer[i].name = strdup(kmsg_tracer_name);
++		if (!kmsg_tracer[i].name)
++			return -1;
++
++		// tracer regex
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_REGEX, i);
++		kmsg_tracer_regex = getenv(buf);
++		if (!kmsg_tracer_regex || (strlen(kmsg_tracer_regex) > BUF_LEN))
++			return -1;
++		snprintf(buf, sizeof(buf), "%s\n", kmsg_tracer_regex);
++		kmsg_tracer[i].regex = strdup(buf);
++		if (!kmsg_tracer[i].regex)
++			return -1;
++
++		// tracer group count
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_GROUP_COUNT, i);
++		tmp = getenv(buf);
++		if (!tmp)
++			return -1;
++		kmsg_tracer_group_count = atoi(tmp);
++		if (kmsg_tracer_group_count < 0)
++			return -1;
++		kmsg_tracer_group_count++;	/* regmatch slot 0 holds the whole match */
++		kmsg_tracer[i].group_count = kmsg_tracer_group_count;
++		kmsg_tracer[i].group_key = calloc(kmsg_tracer_group_count, sizeof(char *));
++		if (!kmsg_tracer[i].group_key)
++			return -1;
++
++		// tracer group key
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_GROUP_KEY, i);
++		tmp = getenv(buf);	/* validate before strdup(): it must not see NULL */
++		if (!tmp || (strlen(tmp) > BUF_LEN))
++			return -1;
++		kmsg_tracer_group_key = strdup(tmp);
++		if (!kmsg_tracer_group_key)
++			return -1;
++
++		token = strtok(kmsg_tracer_group_key, ",");
++		for (c = 0; token && c < kmsg_tracer_group_count; c++) {
++			kmsg_tracer[i].group_key[c] = strdup(token);
++			token = strtok(NULL, ",");
++		}
++		free(kmsg_tracer_group_key);
++
++		ret = regcomp(&kmsg_tracer[i].regex_c, kmsg_tracer[i].regex, REG_EXTENDED);
++		if (ret) {
++			regerror(ret, &kmsg_tracer[i].regex_c, buf, sizeof(buf));
++			log(ALL, LOG_ERR, "Regex compilation error: %s\n", buf);
++			return ret;
++		}
++
++		kmsg_tracer[i].matches = calloc(kmsg_tracer_group_count, sizeof(regmatch_t));
++		if (!kmsg_tracer[i].matches)
++			return -1;
++
++		if (!strcmp("xid", kmsg_tracer[i].name) ||
++		    !strcmp("sxid", kmsg_tracer[i].name) ||
++		    !strcmp("axid", kmsg_tracer[i].name)) {
++			char *s = getenv(KMSG_IGNORE_XID);
++			char *ignore;
++			char *xid_token;
++
++			if (!s)
++				continue;
++
++			ignore = strdup(s);
++			if (ignore) {
++				c = 0;
++				xid_token = strtok(ignore, ",");
++				while (xid_token) {
++					kmsg_tracer[i].info.xid.ignore_xid[c++] = atoi(xid_token);
++					if (c >= 30) {
++						/* ignore_xid[] is full: drop the remaining entries */
++						break;
++					}
++					xid_token = strtok(NULL, ",");
++				}
++				kmsg_tracer[i].info.xid.len = c;
++			}
++
++			free(ignore);
++		}
++	}
++
++	setup_event_trigger("kmsg_monitor");
++
++	return 0;
++}
+diff --git a/ras-kmsg.h b/ras-kmsg.h
+new file mode 100644
+index 0000000..f31125f
+--- /dev/null
++++ b/ras-kmsg.h
+@@ -0,0 +1,47 @@
++
++#ifndef __RAS_KMSG_H
++#define __RAS_KMSG_H
++
++#include
++
++/**
++ * Kernel message tracer related definitions
++ */
++#define KMSG_TRACE_NUM "KMSG_TRACE_NUM"
++#define KMSG_TRACER_NAME "KMSG_TRACER_NAME"
++#define KMSG_TRACER_REGEX "KMSG_TRACER_REGEX"
++#define KMSG_TRACER_GROUP_COUNT "KMSG_TRACER_GROUP_COUNT"
++#define KMSG_TRACER_GROUP_KEY "KMSG_TRACER_GROUP_KEY"
++
++#define KMSG_TRACE_END "KMSG_TRACE_END"
++#define KMSG_IGNORE_XID "KMSG_IGNORE_XID"
++#define KMSG_LIMIT "KMSG_LIMIT"
++
++#define NAME_LEN 64
++#define BUF_LEN 1024
++#define PRINTK_MESSAGE_MAX 2048
++
++extern int kmsg_monitor;
++extern int kmsg_trace_end;
++extern int kmsg_limit;
++
++struct kmsg_tracer_info {
++	char *name;
++	char *regex;
++	int group_count;
++	char **group_key;
++	regex_t regex_c;
++	regmatch_t *matches;
++	union {
++		struct {
++			int len;
++			int ignore_xid[30];
++		} xid;
++	} info;
++};
++
++int kmsg_tracer_init(void);
++int kmsg_tracer_destroy(void);
++int kmsg_match(char *msg);
++
++#endif
+diff --git a/ras-report-json.c b/ras-report-json.c
+index b1c33a4..2d35355 100644
+--- a/ras-report-json.c
++++ b/ras-report-json.c
+@@ -11,17 +11,17 @@
+  * GNU General Public License for more details.
+  */
+ 
++#include
+ #include
+ #include
++#include
+ #include
+-#include
+-#include
+-#include
+-#include
+ #include
+ 
+ #include "traceevent/event-parse.h"
++#include "ras-kmsg.h"
+ #include "ras-report.h"
++#include "ras-time.h"
+ 
+ #define NONE ""
+ int json_report = 1;
+@@ -236,3 +236,63 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
+ 			   ev->apicid);
+ }
+ 
++#ifdef HAVE_KMSG_MONITOR
++/* Print one JSON record for @msg, one member per capture group of @kmsg_tracer. */
++/* A record whose xid/sxid/axid value is listed in ignore_xid[] is dropped unprinted. */
++void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg)
++{
++	struct trace_seq seq;
++	int group_count = kmsg_tracer->group_count;
++	regmatch_t *matches = kmsg_tracer->matches;
++	char tmpbuf[256] = {0}, timestamp[64] = {0};
++	char pci_name[128] = {0}, *key;
++	u16 vendor = 0, device = 0;
++
++	get_kmsg_time(msg, timestamp);
++
++	trace_seq_init(&seq);
++	trace_seq_printf(&seq, "\n{ \"%s\": \"%s\", ", JSON_REPORT_KEY, kmsg_tracer->name);
++	trace_seq_printf(&seq, "\"timestamp\": \"%s\"", timestamp);
++
++	for (int j = 1; j < group_count; j++) {
++		/* an unmatched optional group has rm_so == -1: report it as "" */
++		int len = matches[j].rm_so < 0 ? 0 : (int)(matches[j].rm_eo - matches[j].rm_so);
++		const char *val = len ? msg + matches[j].rm_so : "";
++
++		key = kmsg_tracer->group_key[j - 1];
++		if (!key)	/* fewer keys configured than capture groups */
++			break;
++
++		/* the separator goes in front, so no member can leave a trailing comma */
++		trace_seq_printf(&seq, ", \"%s\": \"%.*s\"", key, len, val);
++
++		if (!strcmp("pci_port", key)) {
++			snprintf(tmpbuf, 128, "%.*s", len, val);
++			get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device);
++			trace_seq_printf(&seq, ", \"pci_dev_name\": \"%s\"", pci_name);
++			trace_seq_printf(&seq, ", \"vendor_id\": \"%#x\"", vendor);
++			trace_seq_printf(&seq, ", \"device_id\": \"%#x\"", device);
++		}
++
++		if (!strcmp("xid", key) ||
++		    !strcmp("sxid", key) ||
++		    !strcmp("axid", key)) {
++			int xid;
++
++			snprintf(tmpbuf, 128, "%.*s", len, val);
++			xid = (int)strtol(tmpbuf, NULL, 10);
++			for (int i = 0; i < kmsg_tracer->info.xid.len; i++) {
++				if (xid == kmsg_tracer->info.xid.ignore_xid[i])
++					goto out;
++			}
++		}
++	}
++
++	trace_seq_puts(&seq, " }");
++	trace_seq_do_printf(&seq);
++	printf("\n");
++out:
++	fflush(stdout);
++	trace_seq_destroy(&seq);
++}
++#endif
+diff --git a/ras-report.h b/ras-report.h
+index eeb25bb..0564992 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -13,6 +13,7 @@
+ #include "ras-mc-handler.h"
+ #include "ras-record.h"
+ #include "types.h"
++#include "ras-kmsg.h"
+ 
+ /* Maximal length of backtrace.
*/
+ #define MAX_BACKTRACE_SIZE (1024 * 1024)
+@@ -127,6 +128,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev);
+ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
+ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
+ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
++void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
+ #endif
+ 
+ #endif
+diff --git a/ras-time.c b/ras-time.c
+new file mode 100644
+index 0000000..320f1a1
+--- /dev/null
++++ b/ras-time.c
+@@ -0,0 +1,103 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#define _GNU_SOURCE
++#include
++#include
++#include
++#include
++#include
++#include
++#ifdef HAVE_SYSINFO
++#include
++#endif
++
++#include "ras-time.h"
++
++struct timeval boot_time;
++time_t suspended_time;
++
++int get_boot_time(struct timeval *boot_time)
++{
++#ifdef CLOCK_BOOTTIME
++	struct timespec hires_uptime;
++	struct timeval lores_uptime;
++#endif
++	struct timeval now;
++#ifdef HAVE_SYSINFO
++	struct sysinfo info;
++#endif
++
++	if (gettimeofday(&now, NULL) != 0)
++		return -errno;
++#ifdef CLOCK_BOOTTIME
++	if (clock_gettime(CLOCK_BOOTTIME, &hires_uptime) == 0) {
++		TIMESPEC_TO_TIMEVAL(&lores_uptime, &hires_uptime);
++		timersub(&now, &lores_uptime, boot_time);
++		return 0;
++	}
++#endif
++#ifdef HAVE_SYSINFO
++	/* fallback */
++	if (sysinfo(&info) != 0)
++		return -errno;
++
++	boot_time->tv_sec = now.tv_sec - info.uptime;
++	boot_time->tv_usec = 0;
++	return 0;
++#else
++	return -ENOSYS;
++#endif
++}
++
++time_t get_suspended_time(void)
++{
++#if defined(CLOCK_BOOTTIME) && defined(CLOCK_MONOTONIC)
++	struct timespec boot, mono;
++
++	if (clock_gettime(CLOCK_BOOTTIME, &boot) == 0 &&
++	    clock_gettime(CLOCK_MONOTONIC, &mono) == 0)
++		return boot.tv_sec - mono.tv_sec;
++#endif
++	return 0;
++}
++
++const char *skip_item(const char *begin, const char *end, const char *sep)
++{
++	while (begin < end) {
++		int c = *begin++;
++
++		if (c == '\0' || strchr(sep, c))
++			break;
++	}
++
++	return begin;
++}
++/* Format the time of kmsg record @msg into @timestamp (written with a 64-byte limit); falls back to now. */
++void get_kmsg_time(const char *msg, char *timestamp)
++{
++	const char *p = msg, *end;
++	char *nu = NULL;
++	uint64_t usec;
++	struct timeval tv = { 0 };
++	time_t t;
++	struct tm *tm;
++
++	end = msg + strlen(msg) - 1;
++
++	p = skip_item(p, end, ",");	/* skip the first two comma-separated fields */
++	p = skip_item(p, end, ",;");
++
++	errno = 0;
++	usec = strtoumax(p, &nu, 10);
++
++	if (!errno && nu && (*nu == ';' || *nu == ',')) {
++		tv.tv_usec = usec % 1000000;
++		tv.tv_sec = usec / 1000000;
++		t = boot_time.tv_sec + suspended_time + tv.tv_sec;
++	} else {
++		t = time(NULL);
++	}
++	tm = localtime(&t);
++	if (tm)	/* localtime() may fail; @timestamp is then left as the caller passed it */
++		strftime(timestamp, 64, "%Y-%m-%d %H:%M:%S %z", tm);
++}
+diff --git a/ras-time.h b/ras-time.h
+new file mode 100644
+index 0000000..5dabae8
+--- /dev/null
++++ b/ras-time.h
+@@ -0,0 +1,27 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#ifndef RAS_TIME_H
++#define RAS_TIME_H
++
++# ifdef CLOCK_MONOTONIC_RAW
++# define UL_CLOCK_MONOTONIC CLOCK_MONOTONIC_RAW
++# else
++# define UL_CLOCK_MONOTONIC CLOCK_MONOTONIC
++# endif
++
++#include
++
++extern struct timeval boot_time;
++extern time_t suspended_time;
++
++int get_boot_time(struct timeval *boot_time);
++
++time_t get_suspended_time(void);
++
++int gettime_monotonic(struct timeval *tv);
++
++const char *skip_item(const char *begin, const char *end, const char *sep);
++
++void get_kmsg_time(const char *msg, char *timestamp);
++
++#endif /* RAS_TIME_H */
+diff --git a/rasdaemon.c b/rasdaemon.c
+index d5d2f85..30dcaf4 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -14,6 +14,8 @@
+ #include "ras-erst.h"
+ #include "ras-events.h"
+ #include "ras-logger.h"
++#include "ras-kmsg.h"
++#include "ras-time.h"
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
+ #include "ras-report.h"
+@@ -154,6 +156,13 @@ int main(int argc, char *argv[])
+ 		json_report = 0;
+ #endif
+ 
++#ifdef HAVE_KMSG_MONITOR
++	if (choices_disable &&
++	    strlen(choices_disable) != 0 &&
++
strstr(choices_disable, "kmsg_monitor"))
++		kmsg_monitor = 0;
++#endif
++
+ #ifdef HAVE_MCE
+ 	const struct argp_option offline_options[] = {
+ 		{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+@@ -271,6 +280,11 @@ int main(int argc, char *argv[])
+ 		log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n");
+ 	}
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	get_boot_time(&boot_time);
++	suspended_time = get_suspended_time();
++#endif
++
+ 	handle_ras_events(args.record_events, args.enable_ipmitool);
+ 
+ #ifdef HAVE_NVGPU
+diff --git a/trigger.c b/trigger.c
+index 7387113..d410137 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -99,6 +99,8 @@ struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"};
+ struct event_trigger pre_page_offline_trigger = {"page_offline", "PRE_PAGE_OFFLINE_TRIGGER"};
+ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFFLINE_TRIGGER"};
+ 
++struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"};
++
+ static struct event_trigger *event_triggers[] = {
+ 	&mc_ue_trigger,
+ #ifdef HAVE_MCE
+@@ -117,6 +119,9 @@ static struct event_trigger *event_triggers[] = {
+ 	&pre_page_offline_trigger,
+ 	&post_page_offline_trigger,
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	&kmsg_trigger,
++#endif
+ };
+ 
+ void setup_event_trigger(const char *event)
+@@ -421,3 +426,53 @@ void run_page_offline_trigger(unsigned long long addr, int otype, int type)
+ 	__run_page_offline_trigger(addr, otype, &pre_page_offline_trigger);
+ }
+ 
++/* Run the kmsg trigger with PATH plus one KEY=value variable per matched capture group. */
++void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg)
++{
++	char *env[MAX_ENV], *key;
++	int ei = 0, e, s;
++	int group_count = kmsg_tracer->group_count;
++	regmatch_t *matches = kmsg_tracer->matches;
++	struct event_trigger *trigger = &kmsg_trigger;
++	char tmpbuf[128];
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	ei++;	/* counted only after success: a failed asprintf() slot must not be freed */
++
++	for (int j = 1; j < group_count; j++) {
++		s = matches[j].rm_so;
++		e = matches[j].rm_eo;
++		key = kmsg_tracer->group_key[j - 1];
++		if (!key || s < 0)	/* unmatched group: nothing to export or to check */
++			continue;
++		if (ei < MAX_ENV - 1) {	/* keep the last slot for the NULL terminator */
++			if (asprintf(&env[ei], "%s=%.*s", key, (int)(e - s), msg + s) < 0)
++				goto free;
++			ei++;
++		}
++
++		if (!strcmp("xid", key) ||
++		    !strcmp("sxid", key) ||
++		    !strcmp("axid", key)) {
++			int xid;
++
++			snprintf(tmpbuf, 128, "%.*s", (int)(e - s), msg + s);
++			xid = (int)strtol(tmpbuf, NULL, 10);
++			for (int i = 0; i < kmsg_tracer->info.xid.len; i++) {
++				if (xid == kmsg_tracer->info.xid.ignore_xid[i])
++					goto free;
++			}
++		}
++	}
++
++	env[ei] = NULL;
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (int i = 0; i < ei; i++)
++		free(env[i]);
++}
+diff --git a/trigger.h b/trigger.h
+index 74df3d3..b5a6c2c 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -4,6 +4,7 @@
+ #define __TRIGGER_H__
+ 
+ #include "ras-record.h"
++#include "ras-kmsg.h"
+ 
+ enum page_offline_trigger_type {
+ 	PRE,
+@@ -27,5 +28,7 @@ void run_mce_record_trigger(struct mce_event *e);
+ void run_mf_event_trigger(struct ras_mf_event *e);
+ void run_aer_event_trigger(struct ras_aer_event *e);
+ void run_page_offline_trigger(unsigned long long addr, int otype, int type);
++void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
++
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1919377ade51d0e1a4a70b8e0c728124e88e4e11
--- /dev/null
+++ b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch
@@ -0,0 +1,1164 @@
+From 29c769fa59e73a016aea891476caea98fbf3a27d Mon Sep 17 00:00:00 2001
+From: Ruidong Tian
+Date: Thu, 12 Dec 2024 09:37:06 +0800
+Subject: [PATCH 19/30] rasdaemon: erst: add erst-mce erst-dmesg
+
+Signed-off-by: Ruidong Tian
+---
+ Makefile.am | 4 +-
+ configure.ac | 4 +
+ ras-erst-dmesg.c | 875 +++++++++++++++++++++++++++++++++++++++++++
+ ras-erst.c | 18 +-
+
ras-erst.h | 7 + + ras-record.h | 1 + + ras-report-json.c | 29 +- + ras-report.h | 1 + + ras-signal-handler.c | 3 + + rasdaemon.c | 2 - + 10 files changed, 932 insertions(+), 12 deletions(-) + create mode 100644 ras-erst-dmesg.c + +diff --git a/Makefile.am b/Makefile.am +index 68b354b..da6ef46 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -142,7 +142,7 @@ if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c + endif + if WITH_ERST +- rasdaemon_SOURCES += ras-erst.c ++ rasdaemon_SOURCES += ras-erst.c ras-erst-dmesg.c + endif + + if WITH_NVGPU +@@ -152,7 +152,7 @@ ras-nvgpu-nvml.h: contrib/nvml.py + rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ +diff --git a/configure.ac b/configure.ac +index dfb7f02..68fcb75 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -287,12 +287,16 @@ AC_ARG_ENABLE([erst], + AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)])) + + AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_CHECK_LIB(z, inflate,[echo "found zlib"] , AC_MSG_ERROR([*** Unable to find zlib library]), ) ++ ZLIBS="-lz" + AC_DEFINE(HAVE_ERST,1,"have ERST") + AC_SUBST([WITH_ERST]) + ]) + AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) + ++AC_SUBST([ZLIBS]) ++ + AC_ARG_ENABLE([nvgpu], + AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events])) + +diff --git a/ras-erst-dmesg.c b/ras-erst-dmesg.c +new file mode 100644 +index 0000000..ce61a6a +--- /dev/null ++++ b/ras-erst-dmesg.c +@@ -0,0 +1,875 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++* 
Copyright (C) 2025 Alibaba Inc
++*/
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "bitfield.h"
++#include "ras-events.h"
++#include "ras-erst.h"
++#include "ras-logger.h"
++#include "ras-mce-handler.h"
++#include "ras-record.h"
++#include "ras-report.h"
++#include "types.h"
++
++struct apei_regex {
++	regex_t hdr;
++	regex_t severity;
++	regex_t error;
++	regex_t fru;
++	regex_t type;
++
++	regex_t addr;
++	regex_t loc;
++	regex_t mem_type;
++	regex_t mem_status;
++
++	regex_t port_type;
++	regex_t port;
++	regex_t id;
++	regex_t status;
++	regex_t aer_sev;
++	regex_t tlp_hdr;
++
++	regex_t cpu_id;
++
++	regex_t midr;
++	regex_t mpidr;
++};
++
++enum {
++	APEI_NONE,
++	APEI_CPU,
++	APEI_MEM,
++	APEI_PCIE,
++	APEI_ARM,
++};
++
++struct apei {
++	int id;
++	int sev;
++	int err_id;
++	char *fru;
++	int type;
++	time_t time;
++	union {
++		struct {
++			uint64_t addr;
++			char *loc;
++			char *status;
++			char *type;
++		} mem;
++		struct {
++			int port_type;
++			char *port;
++			char *vendor_id;
++			char *device_id;
++			char *status;
++			char *mask;
++			char *sev;
++			char *tlp_hdr;
++		} pcie;
++		struct {
++			char *cpu_id;
++		} cpu;
++		struct {
++			char *midr;
++			char *mpidr;
++		} arm;
++	};
++};
++
++time_t last_reboot_time;
++/* Walk /var/log/wtmp backwards and store the time of the LAST_REBOOT_INDEX-th newest reboot record. */
++static void get_last_reboot_time(void)
++{
++	struct utmp record;
++	int fd;
++	int reboots_found = 0;
++	time_t reboot_times = 0;
++
++	fd = open("/var/log/wtmp", O_RDONLY);
++	if (fd == -1) {
++		log(ALL, LOG_ERR, "Error opening wtmp file");
++		return;
++	}
++
++	/* cast before negating: -1 * sizeof() is unsigned and only wraps to the right off_t by luck */
++	if (lseek(fd, -(off_t)sizeof(struct utmp), SEEK_END) == -1) {
++		perror("Error seeking in wtmp file");
++		close(fd);
++		return;
++	}
++
++	while (reboots_found < LAST_REBOOT_INDEX) {
++		if (read(fd, &record, sizeof(struct utmp)) != sizeof(struct utmp)) {
++			perror("Error reading wtmp file");
++			close(fd);
++			return;
++		}
++
++		if (strncmp(record.ut_line, "~", 1) == 0) {
++			if (strncmp(record.ut_user, "reboot", 6) == 0) {
++				reboot_times = record.ut_tv.tv_sec;
++				reboots_found++;
++			}
++		}
++
++		/* step back over the record just read plus the one before it */
++		if (lseek(fd, -2 * (off_t)sizeof(struct utmp), SEEK_CUR) == -1) {
++			reboot_times = 0;
++			break;
++		}
++	}
++
++	close(fd);
++
++	last_reboot_time = reboot_times;
++
++	return;
++}
++
++#define DMESG_ERST_PREFIX "dmesg-erst"
++#define DMESG_ERST_SUFFIX "enc.z"
++
++#define APEI_HEADER ".*\\[(.*).[0-9]+\\] \\{([0-9]+)\\}\\[Hardware Error\\]: Hardware error from APEI Generic Hardware Error Source:.*"
++#define APEI_SEVERITY ".*\\{([0-9]+)\\}\\[Hardware Error\\]: event severity: (.*)"
++#define APEI_ERROR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: Error ([0-9]+), type: (.*)"
++#define APEI_MEM_FRU ".*\\{([0-9]+)\\}\\[Hardware Error\\]: fru_text: (.*)"
++#define APEI_TYPE ".*\\{([0-9]+)\\}\\[Hardware Error\\]: section_type: (.*)"
++
++// MEM
++#define APEI_MEM_ADDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: physical_address: (.*)"
++#define APEI_MEM_LOC ".*\\{([0-9]+)\\}\\[Hardware Error\\]: (node:.*)"
++#define APEI_MEM_TYPE ".*\\{([0-9]+)\\}\\[Hardware Error\\]: error_type: [0-9]+, (.*)"
++#define APEI_MEM_STATUS ".*\\{([0-9]+)\\}\\[Hardware Error\\]:.*error_status: (.*) \\(.*\\)"
++
++// PCIE
++#define APEI_PORT_TYPE ".*\\{([0-9]+)\\}\\[Hardware Error\\]: port_type: ([0-9]+), (.*)"
++#define APEI_PORT ".*\\{([0-9]+)\\}\\[Hardware Error\\]: device_id: (.*)"
++#define APEI_ID ".*\\{([0-9]+)\\}\\[Hardware Error\\]: vendor_id: (.*), device_id: (.*)"
++#define APEI_STATUS ".*\\{([0-9]+)\\}\\[Hardware Error\\]: aer_uncor_status: (.*), aer_uncor_mask: (.*)"
++#define APEI_AER_SEVE ".*\\{([0-9]+)\\}\\[Hardware Error\\]: aer_uncor_severity: (.*)"
++#define APEI_TLP_HDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: TLP Header: (.*)"
++
++#define APEI_CPU_ID ".*\\{([0-9]+)\\}\\[Hardware Error\\]: processor_id: (.*)"
++
++#define APEI_ARM_MIDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: MIDR: (.*)"
++#define APEI_ARM_MPIDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: Multiprocessor Affinity Register \\(MPIDR\\): (.*)"
++
++static int decompress_deflate(const char
*compressed_data, ssize_t compressed_data_size,
++			      char *decompressed_data, ssize_t *decompressed_data_size, z_stream *zstream)
++{
++	int ret = Z_OK;
++
++	ret = inflateReset2(zstream, -MAX_WBITS);
++	if (ret != Z_OK)
++		return ret;
++
++	zstream->next_in = (Bytef *)compressed_data;
++	zstream->avail_in = compressed_data_size;
++	zstream->next_out = (Bytef *)decompressed_data;
++	zstream->avail_out = *decompressed_data_size;
++
++	ret = inflate(zstream, Z_FINISH);
++	if (ret != Z_STREAM_END)
++		return Z_DATA_ERROR;
++
++	*decompressed_data_size = zstream->total_out;
++
++	return ret;	/* success is reported as Z_STREAM_END, not 0 */
++}
++
++static void apei_report_mem(struct trace_seq *s, struct apei *apei)
++{
++	struct ras_mc_event ev = {0};
++	char msg_buf[400];
++	time_t t;
++	struct tm *tm;
++
++	ev.erst = 1;
++	if (!apei->time)
++		t = time(NULL);
++	else
++		t = apei->time;
++
++	tm = localtime(&t);
++	if (tm)
++		strftime(ev.timestamp, sizeof(ev.timestamp),
++			 "%Y-%m-%d %H:%M:%S %z", tm);
++
++	ev.error_count = 1;
++	ev.grain = 1;
++	ev.top_layer = -1;
++	ev.middle_layer = -1;
++	ev.lower_layer = -1;
++
++	switch (apei->sev) {
++	case GHES_SEV_CORRECTED:
++		ev.error_type = "Corrected";
++		break;
++	case GHES_SEV_RECOVERABLE:
++		ev.error_type = "Uncorrected";
++		break;
++	case GHES_SEV_PANIC:
++		ev.error_type = "Fatal";
++		break;
++	default:
++		ev.error_type = "Info";
++	}
++	ev.severity = apei->sev;
++
++	snprintf(msg_buf, sizeof(msg_buf), "APEI location: %s status(0x00000000): %s",
++		 apei->mem.loc ? apei->mem.loc : "",	/* the location line may be absent */
++		 apei->mem.status ? apei->mem.status : "");
++	ev.driver_detail = msg_buf;
++
++	ev.address = apei->mem.addr;
++	ev.mc_index = 0;
++
++#ifdef HAVE_JSON_REPORT
++	report_mc_event_json(s, &ev);
++#endif
++}
++
++/* bit field meaning for correctable error */
++static const char *aer_cor_errors[32] = {
++	/* Correctable errors */
++	[0] = "Receiver Error",
++	[6] = "Bad TLP",
++	[7] = "Bad DLLP",
++	[8] = "RELAY_NUM Rollover",
++	[12] = "Replay Timer Timeout",
++	[13] = "Advisory Non-Fatal",
++	[14] = "Corrected Internal Error",
++};
++
++/* bit field meaning for uncorrectable error */
++static const char *aer_uncor_errors[32] = {
++	/* Uncorrectable errors */
++	[4] = "Data Link Protocol",
++	[12] = "Poisoned TLP",
++	[13] = "Flow Control Protocol",
++	[14] = "Completion Timeout",
++	[15] = "Completer Abort",
++	[16] = "Unexpected Completion",
++	[17] = "Receiver Overflow",
++	[18] = "Malformed TLP",
++	[19] = "ECRC",
++	[20] = "Unsupported Request",
++};
++
++static void apei_report_pcie(struct trace_seq *s, struct apei *apei)
++{
++	struct ras_aer_event ev = {0};
++	unsigned long long status_val;
++	char buf[1024];
++	time_t t;
++	struct tm *tm;
++
++	ev.erst = 1;
++	if (!apei->time)
++		t = time(NULL);
++	else
++		t = apei->time;
++
++	tm = localtime(&t);
++	if (tm)
++		strftime(ev.timestamp, sizeof(ev.timestamp),
++			 "%Y-%m-%d %H:%M:%S %z", tm);
++
++	ev.dev_name = apei->pcie.port;
++	ev.vendor_id = apei->pcie.vendor_id ? strtoul(apei->pcie.vendor_id, NULL, 16) : 0;	/* line may be absent */
++	ev.device_id = apei->pcie.device_id ? strtoul(apei->pcie.device_id, NULL, 16) : 0;
++
++	if (apei->pcie.status) {
++		status_val = strtoull(apei->pcie.status, NULL, 16);
++
++		if (apei->sev == GHES_SEV_CORRECTED)
++			bitfield_msg(buf, sizeof(buf), aer_cor_errors, 32, 0, 0, status_val);
++		else
++			bitfield_msg(buf, sizeof(buf), aer_uncor_errors, 32, 0, 0, status_val);
++	} else {
++		snprintf(buf, 1024, "no status");
++	}
++	ev.msg = buf;
++
++	ev.tlp_header_valid = (apei->pcie.tlp_hdr != NULL);
++	if (ev.tlp_header_valid)
++		snprintf((buf + strlen(ev.msg)), 1024 - strlen(ev.msg),
++			 " TLP Header: %s", apei->pcie.tlp_hdr);
++
++	ev.severity = apei->sev;
++	switch (apei->sev) {
++	case GHES_SEV_RECOVERABLE:
++		ev.error_type = "Uncorrected (Non-Fatal)";
++		break;
++	case GHES_SEV_PANIC:
++		ev.error_type = "Uncorrected (Fatal)";
++		break;
++	case GHES_SEV_CORRECTED:
++		ev.error_type = "Corrected";
++		break;
++	default:
++		ev.error_type = "Unknown severity";
++	}
++
++#ifdef HAVE_JSON_REPORT
++	report_aer_event_json(s, &ev);
++#endif
++}
++/* Print the record assembled in @apei (if it has a type) and reset @apei for the next one. */
++static void report_apei(struct apei *apei)
++{
++	struct trace_seq seq;
++	time_t t;
++	struct tm *tm;
++	char timestamp[64] = "";
++
++	if (!apei->type)
++		return;
++
++	trace_seq_init(&seq);
++	/* MEM and PCIE records are handed to their helpers; CPU and ARM are written inline */
++
++	switch (apei->type) {
++	case APEI_MEM:
++		apei_report_mem(&seq, apei);
++		break;
++	case APEI_PCIE:
++		apei_report_pcie(&seq, apei);
++		break;
++	case APEI_CPU:
++		if (!apei->time)
++			t = time(NULL);
++		else
++			t = apei->time;
++
++		tm = localtime(&t);
++		if (tm)
++			strftime(timestamp, sizeof(timestamp),
++				 "%Y-%m-%d %H:%M:%S %z", tm);
++
++		trace_seq_printf(&seq, "{ \"%s\": \"%s\", ", JSON_REPORT_KEY, "erst_cpu");
++		trace_seq_printf(&seq, "\"timestamp\": \"%s\", ", timestamp);
++		trace_seq_printf(&seq, "\"fru\": \"%s\", ", apei->fru ? apei->fru : "");
++		trace_seq_printf(&seq, "\"severity\": \"%s\", ", severity_strs[apei->sev]);
++		trace_seq_printf(&seq, "\"cpu_id\": \"%s\" ", apei->cpu.cpu_id ? apei->cpu.cpu_id : "");
++		trace_seq_puts(&seq, "}");
++		break;
++	case APEI_ARM:
++		if (!apei->time)
++			t = time(NULL);
++		else
++			t = apei->time;
++
++		tm = localtime(&t);
++		if (tm)
++			strftime(timestamp, sizeof(timestamp),
++				 "%Y-%m-%d %H:%M:%S %z", tm);
++
++		trace_seq_printf(&seq, "{ \"%s\": \"%s\", ", JSON_REPORT_KEY, "erst_arm_cpu");
++		trace_seq_printf(&seq, "\"timestamp\": \"%s\", ", timestamp);
++		trace_seq_printf(&seq, "\"fru\": \"%s\", ", apei->fru ? apei->fru : "");
++		trace_seq_printf(&seq, "\"severity\": \"%s\", ", severity_strs[apei->sev]);
++		trace_seq_printf(&seq, "\"midr\": \"%s\", ", apei->arm.midr ? apei->arm.midr : "");
++		trace_seq_printf(&seq, "\"mpidr\": \"%s\" ", apei->arm.mpidr ? apei->arm.mpidr : "");
++		trace_seq_puts(&seq, "}");
++		break;
++	}
++
++	/* flush what was collected, then clear *apei so the next error block starts clean */
++	trace_seq_do_printf(&seq);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&seq);
++
++	memset(apei, 0, sizeof(*apei));
++	apei->err_id = -1;
++}
++/* Return 1 when @name ends with DMESG_ERST_SUFFIX, else 0. */
++static int is_compressed_file(const char *name)
++{
++	size_t len = strlen(name), slen = strlen(DMESG_ERST_SUFFIX);
++
++	/* a name shorter than the suffix cannot end with it (and must not be indexed backwards) */
++	return len >= slen && strcmp(name + len - slen, DMESG_ERST_SUFFIX) == 0;
++}
++
++static int line_is_panic_part1(char *line)
++{
++	int count, part;
++
++	if (sscanf(line, "Panic#%d Part%u", &count, &part) != 2)
++		return 0;
++
++	return part == 1;
++}
++/* Inflate @buf and test whether its first line is a "Panic#N Part1" header. */
++static int compressed_file_is_panic_part1(char *buf, const char *name, z_stream *zstream)
++{
++	char out_buf[128], *line;
++	ssize_t out_size = sizeof(out_buf) - 1;	/* output room, minus one byte for the NUL */
++
++	/* decompress_deflate() returns Z_STREAM_END on success, so "non-zero" is not failure */
++	if (decompress_deflate(buf, strlen(buf), out_buf, &out_size, zstream) != Z_STREAM_END)
++		return 0;
++	out_buf[out_size] = '\0';
++	line = strtok(out_buf, "\n");
++	return line ? line_is_panic_part1(line) : 0;
++}
++
++static int file_is_panic_part1(FILE *file, const char *name, z_stream *zstream)
++{
++	char line[32];
++
++	if (!fgets(line, 32, file))
++		return 0;
++	/* NOTE(review): fgets()/strlen() stop at '\n'/NUL, so binary .enc.z input is likely cut short here - confirm */
++	if (is_compressed_file(name))
++		return compressed_file_is_panic_part1(line, name, zstream);
++
++	return line_is_panic_part1(line);
++
++}
++/* Copy capture group @i of @line into @buf as a NUL-terminated string ("" when the group did not match). */
++static void regex_group(regmatch_t *m, int i, const char *line, char *buf)
++{
++	int len = m[i].rm_so < 0 ? 0 : m[i].rm_eo - m[i].rm_so;
++
++	/* NOTE(review): every caller in view passes a 128-byte buffer; confirm for any others */
++	if (len > 127)
++		len = 127;
++
++	snprintf(buf, len + 1, "%.*s", len, len ? line + m[i].rm_so : "");
++}
++
++static int dmesg_erst_line_process(const char *line, struct apei_regex *regex, struct apei *apei)
++{
++	int ret, err_id = 0, apei_id = 0;
++	regmatch_t matches[4];
++	char buf[128];
++	regex_t *re;
++	time_t t;
++
++
ret = regexec(re = &regex->hdr, line, 4, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, line, buf);
++		apei_id = atoi(buf);
++
++		if (apei->id && apei_id != apei->id)
++			report_apei(apei);	/* a new header id closes the previous record */
++		apei->id = apei_id;
++
++		regex_group(matches, 1, line, buf);
++		t = atoll(buf);
++
++		if (last_reboot_time)
++			apei->time = last_reboot_time + t;
++		else
++			apei->time = 0;
++
++		return 0;
++	}
++
++	ret = regexec(re = &regex->error, line, 4, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, line, buf);
++		err_id = atoi(buf);
++
++		if (apei->err_id != -1 && err_id != apei->err_id)
++			report_apei(apei);
++
++		apei->err_id = err_id;
++
++		regex_group(matches, 3, line, buf);
++		if (!strcmp("corrected", buf))
++			apei->sev = GHES_SEV_CORRECTED;
++		else if (!strcmp("recoverable", buf))
++			apei->sev = GHES_SEV_RECOVERABLE;
++		else if (!strcmp("fatal", buf))
++			apei->sev = GHES_SEV_PANIC;
++		else
++			apei->sev = GHES_SEV_NO;
++		return 0;
++	}
++
++	if (!apei->type) {
++		ret = regexec(re = &regex->type, line, 4, matches, 0);
++		if (ret)
++			goto error;
++
++		regex_group(matches, 2, line, buf);
++		if (!strcmp("general processor error", buf))
++			apei->type = APEI_CPU;
++		else if (!strcmp("memory error", buf))
++			apei->type = APEI_MEM;
++		else if (!strcmp("PCIe error", buf))
++			apei->type = APEI_PCIE;
++		else if (!strcmp("ARM processor error", buf))
++			apei->type = APEI_ARM;
++		else
++			apei->type = APEI_NONE;
++
++		return 0;
++	}
++
++	switch (apei->type) {
++	case APEI_CPU:
++		ret = regexec(re = &regex->cpu_id, line, 4, matches, 0);
++		if (ret)
++			goto error;
++		regex_group(matches, 2, line, buf);
++		apei->cpu.cpu_id = strdup(buf);
++
++		return 0;
++	case APEI_ARM:
++		if (!apei->arm.midr) {
++			ret = regexec(re = &regex->midr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->arm.midr = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		if (!apei->arm.mpidr) {
++			ret = regexec(re = &regex->mpidr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->arm.mpidr = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		return 0;
++	case APEI_MEM:
++		if (!apei->mem.addr) {
++			ret = regexec(re = &regex->addr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.addr = strtoull(buf, NULL, 16);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.loc) {
++			ret = regexec(re = &regex->loc, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.loc = strdup(buf);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.type) {
++			ret = regexec(re = &regex->mem_type, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.type = strdup(buf);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.status) {
++			ret = regexec(re = &regex->mem_status, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.status = strdup(buf);
++				return 0;
++			}
++		}
++		return 0;	/* do not fall into the PCIe parser: pcie.* shares the union with mem.* */
++	case APEI_PCIE:
++		//port type
++		ret = regexec(re = &regex->port_type, line, 4, matches, 0);
++		if (ret > REG_NOMATCH) {
++			goto error;
++		} else if (!ret) {
++			regex_group(matches, 2, line, buf);
++			apei->pcie.port_type = atoi(buf);
++
++			return 0;
++		}
++
++		// port
++		if (!apei->pcie.port) {
++			ret = regexec(re = &regex->port, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.port = strdup(buf);
++				return 0;
++			}
++		}
++
++		// vendor id device id
++		if (!apei->pcie.vendor_id) {
++			ret = regexec(re = &regex->id, line, 4, matches, 0);	/* keep re in step for regerror() */
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.vendor_id = strdup(buf);
++				regex_group(matches, 3, line, buf);
++				apei->pcie.device_id = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// status
++		if (!apei->pcie.status) {
++			ret = regexec(re = &regex->status, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.status = strdup(buf);
++				regex_group(matches, 3, line, buf);
++				apei->pcie.mask = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// aer sev
++		if (!apei->pcie.sev) {
++			ret = regexec(re = &regex->aer_sev, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.sev = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// tlp hdr
++		if (!apei->pcie.tlp_hdr) {
++			ret = regexec(re = &regex->tlp_hdr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.tlp_hdr = strdup(buf);
++
++				return 0;
++			}
++		}
++	}
++
++error:
++	if (ret == REG_NOMATCH)	/* a line that matches nothing is not an error */
++		return 0;
++	regerror(ret, re, buf, sizeof(buf));
++	printf("Regex execution error: %s\n", buf);
++	return ret;
++}
++
++static int handle_erst_dmesg(FILE *file, const char *name, z_stream *zstream, struct apei_regex *regex)
++{
++	long fileSize;
++	char *file_buf, *line, *out_data = NULL;
++	ssize_t out_max_size, out_data_size = 0, bytesRead;
++	int ret = 0, line_number = 1;
++	struct apei apei = {0};
++
++	apei.err_id = -1;
++
++	if (!file_is_panic_part1(file, name, zstream))
++		return -1;
++
++	if (fseek(file, 0, SEEK_END) != 0)
++		return -1;
++
++	fileSize = ftell(file);
++	if (fileSize == -1)
++		return -1;
++
++	file_buf = (char *)malloc(fileSize + 1);
++	if (!file_buf)
++		return -1;
++
++	rewind(file);
++	bytesRead = fread(file_buf, 1, fileSize, file);
++	if (bytesRead != fileSize) {
++		ret = -1;
++		goto free_file;
++	}
++	file_buf[fileSize] = '\0';
++
++	if (is_compressed_file(name)) {
++
out_max_size = fileSize * 3; ++ out_data = (char *)malloc(out_max_size); ++ if (!out_data) { ++ ret = -1; ++ goto free_file; ++ } ++ ++ ret = decompress_deflate(file_buf, fileSize, out_data, &out_data_size, zstream); ++ if (ret) ++ goto free_out; ++ ++ file_buf = out_data; ++ } ++ ++ line = strtok(file_buf, "\n"); ++ ++ while (line) { ++ dmesg_erst_line_process(line, regex, &apei); ++ ++ line = strtok(NULL, "\n"); ++ line_number++; ++ } ++ ++ report_apei(&apei); ++ ++free_out: ++ if (out_data) ++ free(out_data); ++free_file: ++ free(file_buf); ++ ++ return ret; ++} ++ ++static int init_reg(regex_t *re, const char *str) ++{ ++ char buf[128]; ++ int ret = 0; ++ ++ ret = regcomp(re, str, REG_EXTENDED); ++ if (ret) { ++ regerror(ret, re, buf, sizeof(buf)); ++ printf("Regex execution error: %s\n", buf); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++static void handle_erst_dmesg_file(const char *dir_name, const char *d_name, z_stream *zstream, struct apei_regex *regex) ++{ ++ char file_path[512]; ++ FILE *file; ++ ++ if (strncmp(d_name, DMESG_ERST_PREFIX, strlen(DMESG_ERST_PREFIX))) ++ return; ++ ++ snprintf(file_path, sizeof(file_path), "%s/%s", dir_name, d_name); ++ ++ file = fopen(file_path, "r"); ++ if (!file) { ++ log(ALL, LOG_INFO, "Failed to open file %s\n", file_path); ++ return; ++ } ++ ++ handle_erst_dmesg(file, file_path, zstream, regex); ++ ++ fclose(file); ++ ++ if (erst_delete && unlink(file_path)) { ++ log(ALL, LOG_INFO, "Error deleting file %s\n", file_path); ++ return; ++ } ++} ++ ++void handle_erst_panic(void) ++{ ++ z_stream zstream = { 0 }; ++ int rc = 0; ++ struct dirent *entry; ++ struct apei_regex regex; ++ ++ if (!last_reboot_time) ++ get_last_reboot_time(); ++ ++ if (init_reg(®ex.hdr, APEI_HEADER) || ++ init_reg(®ex.severity, APEI_SEVERITY) || ++ init_reg(®ex.error, APEI_ERROR) || ++ init_reg(®ex.fru, APEI_MEM_FRU) || ++ init_reg(®ex.type, APEI_TYPE) || ++ init_reg(®ex.addr, APEI_MEM_ADDR) || ++ init_reg(®ex.loc, APEI_MEM_LOC) || ++ 
init_reg(®ex.mem_type, APEI_MEM_TYPE) || ++ init_reg(®ex.mem_status, APEI_MEM_STATUS) || ++ init_reg(®ex.port_type, APEI_PORT_TYPE) || ++ init_reg(®ex.port, APEI_PORT) || ++ init_reg(®ex.id, APEI_ID) || ++ init_reg(®ex.status, APEI_STATUS) || ++ init_reg(®ex.aer_sev, APEI_AER_SEVE) || ++ init_reg(®ex.tlp_hdr, APEI_TLP_HDR) || ++ init_reg(®ex.cpu_id, APEI_CPU_ID) || ++ init_reg(®ex.midr, APEI_ARM_MIDR) || ++ init_reg(®ex.mpidr, APEI_ARM_MPIDR)) ++ return; ++ ++ DIR *dir = opendir(ERST_PATH); ++ ++ if (!dir) { ++ log(ALL, LOG_INFO, "%s Failed to open directory %s\n", ERST_PATH, strerror(errno)); ++ return; ++ } ++ ++ inflateInit2(&zstream, -MAX_WBITS); ++ if (rc != Z_OK) { ++ log(ALL, LOG_INFO, "Failed to open init inflate %d\n", rc); ++ return; ++ } ++ ++ while ((entry = readdir(dir)) != NULL) { ++ struct stat path_stat; ++ char file_path[MAX_PATH]; ++ ++ snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name); ++ stat(file_path, &path_stat); ++ ++ if (S_ISDIR(path_stat.st_mode) && !strncmp("erst", entry->d_name, sizeof("erst"))) { ++ DIR *subdir = opendir(file_path); ++ struct dirent *subentry; ++ ++ if (!subdir) { ++ log(ALL, LOG_INFO, "Failed to open directory %s\n", strerror(errno)); ++ break; ++ } ++ while ((subentry = readdir(subdir)) != NULL) ++ handle_erst_dmesg_file(file_path, subentry->d_name, &zstream, ®ex); ++ ++ closedir(subdir); ++ ++ } else ++ handle_erst_dmesg_file(ERST_PATH, entry->d_name, &zstream, ®ex); ++ } ++ ++ closedir(dir); ++ ++ inflateEnd(&zstream); ++} +diff --git a/ras-erst.c b/ras-erst.c +index c024d60..a0ece1b 100644 +--- a/ras-erst.c ++++ b/ras-erst.c +@@ -14,6 +14,8 @@ + #include "ras-logger.h" + #include "ras-mce-handler.h" + #include "ras-record.h" ++#include "ras-report.h" ++#include "ras-time.h" + #include "types.h" + + struct mce { +@@ -43,11 +45,7 @@ struct mce { + uint32_t microcode; /* Microcode revision */ + }; + +-static int erst_delete; +- +-#define ERST_PATH "/sys/fs/pstore/erst" +-#define MCE_ERST_PREFIX 
"mce-erst" +-#define ERST_EVENT_NAME "mce_erst_record" ++int erst_delete; + + #ifdef HAVE_MCE + static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) +@@ -80,6 +78,9 @@ static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) + "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME); + + report_mce_event(ras, NULL, &s, e); ++#ifdef HAVE_JSON_REPORT ++ report_mce_event_json(&s, e); ++#endif + trace_seq_terminate(&s); + trace_seq_do_printf(&s); + printf("\n"); +@@ -188,8 +189,15 @@ static void handle_erst_mce(void) + /* ERST just support mce now */ + void handle_erst(void) + { ++ get_boot_time(&boot_time); ++ suspended_time = get_suspended_time(); ++ + if (getenv(ERST_DELETE)) + erst_delete = atoi(getenv(ERST_DELETE)); + ++#ifdef HAVE_MCE + handle_erst_mce(); ++#endif ++ ++ handle_erst_panic(); + } +diff --git a/ras-erst.h b/ras-erst.h +index 83d7535..29a5587 100644 +--- a/ras-erst.h ++++ b/ras-erst.h +@@ -8,10 +8,17 @@ + #define __RAS_ERST_H + + #define ERST_DELETE "ERST_DELETE" ++#define ERST_PATH "/sys/fs/pstore/erst" ++#define MCE_ERST_PREFIX "mce-erst" ++#define ERST_EVENT_NAME "mce_erst_record" ++#define ERST_PANIC_NAME "dmesg_erst_record" ++#define LAST_REBOOT_INDEX 2 + ++extern int erst_delete; + #ifdef HAVE_MCE + void handle_erst_mce(void); + #endif + + void handle_erst(void); ++void handle_erst_panic(void); + #endif +diff --git a/ras-record.h b/ras-record.h +index 7f49b74..416f679 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -101,6 +101,7 @@ struct ras_arm_event { + uint64_t error_info; + uint64_t virt_fault_addr; + uint64_t phy_fault_addr; ++ int erst; + }; + + struct devlink_event { +diff --git a/ras-report-json.c b/ras-report-json.c +index 2d35355..e28cfac 100644 +--- a/ras-report-json.c ++++ b/ras-report-json.c +@@ -45,6 +45,7 @@ void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev) + "\"syndrome\": \"%#llx\", " + "\"driver_detail\": \"%s\" }", + JSON_REPORT_KEY, ++ ev->erst ? 
"erst_mc_event" : "mc_event", + (*ev->timestamp) ? ev->timestamp : NONE, + severity_strs[ev->severity], + ev->error_count, +@@ -114,7 +115,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) + get_pci_dev_name(ev->dev_name, pci_name, 128, &vendor, &device); + + trace_seq_printf(s, +- "\n{ \"%s\": \"aer_event\", " \ ++ "\n{ \"%s\": \"%s\", " \ + "\"timestamp\": \"%s\", " \ + "\"severity\": \"%s\", " \ + "\"error_type\": \"%s\", " \ +@@ -124,12 +125,14 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) + "\"device_id\": \"%#x\", " \ + "\"msg\": \"%s\" }", + JSON_REPORT_KEY, ++ ev->erst ? "erst_aer_event" : "aer_event", + (*ev->timestamp) ? ev->timestamp : NONE, + severity_strs[ev->severity], + (ev->error_type) ? ev->error_type : NONE, + (ev->dev_name) ? ev->dev_name : NONE, + (*pci_name) ? pci_name : NONE, +- vendor, device, ++ ev->vendor_id ? ev->vendor_id : vendor, ++ ev->device_id ? ev->device_id : device, + (ev->msg) ? ev->msg : NONE); + } + +@@ -139,7 +142,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) + return; + + trace_seq_printf(s, +- "\n{ \"%s\": \"arm_event\", " \ ++ "\n{ \"%s\": \"%s\", " \ + "\"timestamp\": \"%s\", " \ + "\"error_count\": %d, " \ + "\"affinity\": %d, " \ +@@ -148,6 +151,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) + "\"running_state\": %d, " \ + "\"psci_state\": %d }", + JSON_REPORT_KEY, ++ ev->erst ? "erst_arm_event" : "arm_event", + (*ev->timestamp) ? ev->timestamp : NONE, + ev->error_count, + ev->affinity, +@@ -173,6 +177,24 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev) + (ev->action_result) ? 
ev->action_result : NONE); + } + ++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev) ++{ ++ if (!s || !ev || !json_report) ++ return; ++ ++ trace_seq_printf(s, ++ "\n{ \"%s\": \"signal_event\", \"timestamp\": \"%s\", " \ ++ "\"signo\": %d, \"sigerr\": %d, " \ ++ "\"sigcode\": %d, \"comm\": \"%s\", " \ ++ "\"pid\": %d, \"group\": %d, " \ ++ "\"result\": %d }", ++ JSON_REPORT_KEY, ++ (*ev->timestamp) ? ev->timestamp : NONE, ++ ev->sig, ev->error_no, ev->code, ++ (ev->comm) ? ev->comm : NONE, ++ ev->pid, ev->group, ev->result); ++} ++ + void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) + { + if (!s || !ev || !json_report) +@@ -211,6 +233,7 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) + "\"mcgcap\": \"%#lx\", " + "\"apicid\": \"%#x\" }", + JSON_REPORT_KEY, ++ ev->erst ? "erst_mce_record" : "mce_record", + (*ev->timestamp) ? ev->timestamp : NONE, + severity_strs[ev->severity], + ev->bank, +diff --git a/ras-report.h b/ras-report.h +index 0564992..7f7f304 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -129,6 +129,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev); + void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); + void report_mce_event_json(struct trace_seq *s, struct mce_event *ev); + void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev); + #endif + + #endif +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index d15c4f6..0d999a6 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -130,6 +130,9 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, + + report_ras_signal_event(s, &ev); + ++#ifdef HAVE_JSON_REPORT ++ report_signal_event_json(s, &ev); ++#endif + /* Store data into the SQLite DB */ + #ifdef HAVE_SQLITE3 + ras_store_signal_event(ras, &ev); +diff --git a/rasdaemon.c 
b/rasdaemon.c +index 30dcaf4..335c047 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -247,13 +247,11 @@ int main(int argc, char *argv[]) + exit(EXIT_FAILURE); + + #ifdef HAVE_ERST +-#ifdef HAVE_MCE + if (choices_disable && strlen(choices_disable) != 0 && + strstr(choices_disable, "ras:erst")) + log(ALL, LOG_INFO, "Disabled ras:erst from config\n"); + else + handle_erst(); +-#endif + #endif + if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE))) + config_pcie_edpc(); +-- +2.43.5 + diff --git a/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch new file mode 100644 index 0000000000000000000000000000000000000000..26de833c4c534b61ca8a37449dd805720f22228d --- /dev/null +++ b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch @@ -0,0 +1,484 @@ +From e58b2e2c034ecfd6de044d8daee6d66a18b1ea3c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 17 Dec 2024 09:36:55 +0800 +Subject: [PATCH 20/30] anolis: rasdaemon: add amdgpu ras error monitor + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 2 +- + misc/rasdaemon.env | 1 + + ras-events.c | 1 + + ras-kmsg-amdgpu.c | 219 +++++++++++++++++++++++++++++++++++++++++++++ + ras-kmsg.c | 4 + + ras-kmsg.h | 25 ++++++ + ras-mce-handler.c | 3 + + ras-record.h | 3 + + ras-report-json.c | 81 +++++++++++++++++ + ras-report.h | 3 + + 10 files changed, 341 insertions(+), 1 deletion(-) + create mode 100644 ras-kmsg-amdgpu.c + +diff --git a/Makefile.am b/Makefile.am +index da6ef46..328fa49 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -135,7 +135,7 @@ if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c + endif + if WITH_KMSG_MONITOR +- rasdaemon_SOURCES += ras-kmsg.c ras-time.c ++ rasdaemon_SOURCES += ras-kmsg.c ras-time.c ras-kmsg-amdgpu.c + endif + + if WITH_POISON_PAGE_STAT +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index f498e24..2816505 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -131,6 +131,7 @@ 
MC_CE_STAT_THRESHOLD=2000 + POISON_STAT_THRESHOLD=102400 + + ERST_DELETE=1 ++AMDGPU_MCA_ENABLED=0 + + # EDPC config + # +diff --git a/ras-events.c b/ras-events.c +index d40f29e..88c83df 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -624,6 +624,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + } else if (size > 0) { + kmsg_buf[size] = '\0'; + kmsg_match(kmsg_buf); ++ amdgpu_tracer_match(kmsg_buf); + memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX); + } else { + count_nready++; +diff --git a/ras-kmsg-amdgpu.c b/ras-kmsg-amdgpu.c +new file mode 100644 +index 0000000..c46525a +--- /dev/null ++++ b/ras-kmsg-amdgpu.c +@@ -0,0 +1,219 @@ ++#include "ras-time.h" ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++#include "ras-kmsg.h" ++#include "trigger.h" ++ ++#define AMDGPU_ERROR_HEADER ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: Accelerator Check Architecture events logged\n" ++#define AMDGPU_ERROR_STATUS ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].STATUS=(0x[0-9A-Fa-f]+)\n" ++#define AMDGPU_ERROR_ADDR ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].ADDR=(0x[0-9A-Fa-f]+)\n" ++#define AMDGPU_ERROR_MISC0 ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].MISC0=(0x[0-9A-Fa-f]+)\n" ++#define AMDGPU_ERROR_IPID ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].IPID=(0x[0-9A-Fa-f]+)\n" ++#define AMDGPU_ERROR_SYND ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].SYND=(0x[0-9A-Fa-f]+)\n" ++ ++#define AMDGPU_MCA_ENABLED "AMDGPU_MCA_ENABLED" ++ ++static struct amdgpu_tracer *amdgpu_tracer; ++static struct amdgpu_error *amdgpu_error; ++static int amdgpu_mca_enable; ++ ++static void report_amdgpu_mca(struct amdgpu_error *e) ++{ ++#ifdef HAVE_MCE ++ struct ras_mc_offline_event event; ++ ++ event.smca = true; ++ event.family = 0x17; 
++ event.model = 0x17; ++ event.bank = 1; ++ event.status = e->status; ++ event.synd = e->synd; ++ event.ipid = e->ipid; ++ event.addr = e->addr; ++ event.misc0 = e->misc0; ++ event.domain = e->seq; ++ event.bus = e->bus; ++ event.device = e->dev; ++ event.function = e->func; ++ ++ ras_offline_mce_event(&event); ++#endif ++} ++ ++static void report_amdgpu_error(struct amdgpu_error *e) ++{ ++ if (amdgpu_mca_enable && e->ipid && e->status) ++ report_amdgpu_mca(e); ++ else ++ report_amdgpu_error_json(e); ++} ++ ++static void regex_group(regmatch_t *m, int i, const char *line, char *buf) ++{ ++ int e, s; ++ ++ s = m[i].rm_so; ++ e = m[i].rm_eo; ++ if (s >= 0) ++ snprintf(buf, e - s + 1, "%s", line + s); ++ else ++ buf = NULL; ++} ++ ++void amdgpu_tracer_match(char *msg) ++{ ++ regmatch_t matches[10]; ++ regex_t *re; ++ char buf[128]; ++ int ret; ++ ++ ret = regexec(re = &(amdgpu_tracer->header), msg, 2, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) { ++ if (amdgpu_error->tracing) { ++ report_amdgpu_error(amdgpu_error); ++ } ++ ++ memset(amdgpu_error, 0, sizeof(*amdgpu_error)); ++ amdgpu_error->tracing = 1; ++ ++ get_kmsg_time(msg, amdgpu_error->timestamp); ++ ++ regex_group(matches, 1, msg, buf); ++ sscanf(buf, "%x:%x:%x.%x", ++ &amdgpu_error->seq, ++ &amdgpu_error->bus, ++ &amdgpu_error->dev, ++ &amdgpu_error->func); ++ ++ return; ++ } ++ ++ ret = regexec(re = &amdgpu_tracer->status, msg, 3, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) { ++ regex_group(matches, 2, msg, buf); ++ amdgpu_error->status = strtoull(buf, NULL, 16); ++ ++ return; ++ } ++ ++ ret = regexec(re = &amdgpu_tracer->addr, msg, 3, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) { ++ regex_group(matches, 2, msg, buf); ++ amdgpu_error->addr = strtoull(buf, NULL, 16); ++ ++ return; ++ } ++ ++ ret = regexec(re = &amdgpu_tracer->misc0, msg, 3, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) 
{ ++ regex_group(matches, 2, msg, buf); ++ amdgpu_error->misc0 = strtoull(buf, NULL, 16); ++ ++ return; ++ } ++ ++ ret = regexec(re = &amdgpu_tracer->ipid, msg, 3, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) { ++ regex_group(matches, 2, msg, buf); ++ amdgpu_error->ipid = strtoull(buf, NULL, 16); ++ ++ return; ++ } ++ ++ ret = regexec(re = &amdgpu_tracer->synd, msg, 3, matches, 0); ++ if (ret > REG_NOMATCH) { ++ goto error; ++ } else if (!ret) { ++ regex_group(matches, 2, msg, buf); ++ amdgpu_error->synd = strtoull(buf, NULL, 16); ++ ++ report_amdgpu_error(amdgpu_error); ++ amdgpu_error->tracing = 0; ++ ++ return; ++ } ++ ++error: ++ if (ret == REG_NOMATCH) ++ return; ++ regerror(ret, re, buf, sizeof(buf)); ++ printf("Regex execution error: %s\n", buf); ++ return; ++} ++ ++int amdgpu_tracer_destroy(void) ++{ ++ log(ALL, LOG_INFO, "amdgpu tracer destroy\n"); ++ ++ if (!amdgpu_error) ++ free(amdgpu_error); ++ ++ if (!amdgpu_tracer) ++ free(amdgpu_tracer); ++ ++ return 0; ++} ++ ++static int init_reg(regex_t *re, const char *str) ++{ ++ char buf[128]; ++ int ret = 0; ++ ++ ret = regcomp(re, str, REG_EXTENDED); ++ if (ret) { ++ regerror(ret, re, buf, sizeof(buf)); ++ printf("Regex execution error: %s\n", buf); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++int amdgpu_tracer_init(void) ++{ ++ char *s; ++ ++ s = getenv(AMDGPU_MCA_ENABLED); ++ if (!s || strcmp(s, "1")) ++ amdgpu_mca_enable = 0; ++ else ++ amdgpu_mca_enable = 1; ++ ++ amdgpu_error = calloc(1, sizeof(struct amdgpu_error)); ++ if (!amdgpu_error) ++ return -1; ++ ++ amdgpu_tracer = calloc(1, sizeof(struct amdgpu_tracer)); ++ if (!amdgpu_tracer) ++ return -1; ++ ++ if (init_reg(&amdgpu_tracer->header, AMDGPU_ERROR_HEADER) || ++ init_reg(&amdgpu_tracer->status, AMDGPU_ERROR_STATUS) || ++ init_reg(&amdgpu_tracer->addr, AMDGPU_ERROR_ADDR) || ++ init_reg(&amdgpu_tracer->misc0, AMDGPU_ERROR_MISC0) || ++ init_reg(&amdgpu_tracer->ipid, AMDGPU_ERROR_IPID) || ++ 
init_reg(&amdgpu_tracer->synd, AMDGPU_ERROR_SYND)) ++ log(ALL, LOG_ERR, "amdgpu tracer init failed\n"); ++ ++ return 0; ++} +\ No newline at end of file +diff --git a/ras-kmsg.c b/ras-kmsg.c +index 2dd47d6..deeb475 100644 +--- a/ras-kmsg.c ++++ b/ras-kmsg.c +@@ -72,6 +72,8 @@ int kmsg_tracer_destroy(void) + } + free(kmsg_tracer); + ++ amdgpu_tracer_destroy(); ++ + return 0; + } + +@@ -82,6 +84,8 @@ int kmsg_tracer_init(void) + char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp; + char *kmsg_tracer_group_key, *token; + ++ amdgpu_tracer_init(); ++ + s = getenv(KMSG_TRACE_END); + if (!s) + kmsg_trace_end = 0; +diff --git a/ras-kmsg.h b/ras-kmsg.h +index f31125f..9e34da5 100644 +--- a/ras-kmsg.h ++++ b/ras-kmsg.h +@@ -3,6 +3,7 @@ + #define __RAS_KMSG_H + + #include ++#include + + /** + * Kernel message tracer related definitions +@@ -40,8 +41,32 @@ struct kmsg_tracer_info { + } info; + }; + ++struct amdgpu_tracer { ++ regex_t header; ++ regex_t status; ++ regex_t addr; ++ regex_t misc0; ++ regex_t ipid; ++ regex_t synd; ++}; ++ ++struct amdgpu_error { ++ char timestamp[64]; ++ int seq, bus, dev, func; ++ int tracing; ++ uint64_t status; ++ uint64_t addr; ++ uint64_t misc0; ++ uint64_t ipid; ++ uint64_t synd; ++}; ++ + int kmsg_tracer_init(void); + int kmsg_tracer_destroy(void); + int kmsg_match(char *msg); + ++void amdgpu_tracer_match(char *msg); ++int amdgpu_tracer_destroy(void); ++int amdgpu_tracer_init(void); ++ + #endif +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index b61976a..fc2e8d4 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -491,6 +491,9 @@ int ras_offline_mce_event(struct ras_mc_offline_event *event) + + trace_seq_init(&s); + report_mce_offline(&s, mce, priv); ++#ifdef HAVE_JSON_REPORT ++ report_mce_offline_json(&s, mce, event); ++#endif + trace_seq_do_printf(&s); + fflush(stdout); + trace_seq_destroy(&s); +diff --git a/ras-record.h b/ras-record.h +index 416f679..d0230f7 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ 
-46,6 +46,9 @@ struct ras_mc_offline_event { + uint64_t ipid; + uint64_t synd; + uint64_t status; ++ uint64_t addr; ++ uint64_t misc0; ++ int domain, bus, device, function; + }; + + struct ras_aer_event { +diff --git a/ras-report-json.c b/ras-report-json.c +index e28cfac..577e856 100644 +--- a/ras-report-json.c ++++ b/ras-report-json.c +@@ -319,3 +319,84 @@ out: + trace_seq_destroy(&seq); + } + #endif ++ ++void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce, ++ struct ras_mc_offline_event *e) ++{ ++ char tmpbuf[128] = {0}, pci_name[128] = {0}; ++ u16 vendor, device; ++ ++ if (!s || !e || !mce || !json_report) ++ return; ++ ++ snprintf(tmpbuf, 128, "%x:%x:%x.%x", e->domain, e->bus, e->device, e->function); ++ get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device); ++ ++ trace_seq_printf(s, ++ "\n{ \"%s\": \"amdgpu_ras_event\", " \ ++ "\"timestamp\": \"%s\", " \ ++ "\"bank_name\": \"%s\", " \ ++ "\"bank\": %d, " \ ++ "\"mcastatus_msg\": \"%s\", " \ ++ "\"mcistatus_msg\": \"%s\", " \ ++ "\"mc_location\": \"%s\", " \ ++ "\"error_msg\": \"%s\", " \ ++ "\"pci_bdf\": \"%s\", " \ ++ "\"pci_dev_name\": \"%s\", " \ ++ "\"vendor_id\": \"%#x\", " \ ++ "\"device_id\": \"%#x\", " \ ++ "\"status\": \"%#lx\", " \ ++ "\"addr\": \"%#lx\", " \ ++ "\"misc0\": \"%#lx\", " \ ++ "\"ipid\": \"%#lx\", " \ ++ "\"synd\": \"%#lx\" }\n", ++ JSON_REPORT_KEY, ++ (*mce->timestamp) ? mce->timestamp : NONE, ++ (*mce->bank_name) ? mce->bank_name : NONE, ++ mce->bank, ++ (*mce->mcastatus_msg) ? mce->mcastatus_msg : NONE, ++ (*mce->mcistatus_msg) ? mce->mcistatus_msg : NONE, ++ (*mce->mc_location) ? mce->mc_location : NONE, ++ (*mce->error_msg) ? 
mce->error_msg : NONE, ++ tmpbuf, pci_name, vendor, device, ++ e->status, e->addr, e->misc0, e->ipid, e->synd); ++} ++ ++void report_amdgpu_error_json(struct amdgpu_error *e) ++{ ++ struct trace_seq seq; ++ char tmpbuf[128] = {0}, pci_name[128] = {0}; ++ u16 vendor = 0, device = 0; /* defined values even if the PCI lookup fills nothing in */ ++ ++ if (!e || !json_report) ++ return; ++ ++ snprintf(tmpbuf, 128, "%x:%x:%x.%x", e->seq, e->bus, e->dev, e->func); ++ get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device); ++ ++ trace_seq_init(&seq); ++ trace_seq_printf(&seq, ++ "\n{ \"%s\": \"amdgpu_ras_event\", " \ ++ "\"timestamp\": \"%s\", " \ ++ "\"pci_dev_name\": \"%s\", " \ ++ "\"vendor_id\": \"%#x\", " \ ++ "\"device_id\": \"%#x\", " \ ++ "\"status\": \"%#lx\", " \ ++ "\"addr\": \"%#lx\", " \ ++ "\"misc0\": \"%#lx\", " \ ++ "\"ipid\": \"%#lx\", " \ ++ "\"synd\": \"%#lx\" }", ++ JSON_REPORT_KEY, ++ (*e->timestamp) ? e->timestamp : "", ++ pci_name, vendor, device, ++ e->status, ++ e->addr, ++ e->misc0, ++ e->ipid, ++ e->synd); ++ ++ trace_seq_do_printf(&seq); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&seq); ++} +diff --git a/ras-report.h b/ras-report.h +index 7f7f304..7066a74 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -130,6 +130,9 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); + void report_mce_event_json(struct trace_seq *s, struct mce_event *ev); + void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg); + void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev); ++void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce, ++ struct ras_mc_offline_event *e); ++void report_amdgpu_error_json(struct amdgpu_error *e); + #endif + + #endif +-- +2.43.5 + diff --git a/1021-anolis-config-disable-page-offline-defalut.patch b/1021-anolis-config-disable-page-offline-defalut.patch new file mode 100644 index 0000000000000000000000000000000000000000..880db2207a175d13ecab251b40fa4458d5e085e4 --- /dev/null +++
b/1021-anolis-config-disable-page-offline-defalut.patch @@ -0,0 +1,26 @@ +From 344b4080d5d093123de8973b74f8289201931483 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Mon, 10 Mar 2025 11:27:45 +0800 +Subject: [PATCH 21/30] anolis: config: disable page offline defalut + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 2816505..1833f1b 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -54,7 +54,7 @@ ROW_CE_ACTION="off" + # Requires an uptodate kernel. Might not be successfull. + # soft-then-hard First try to soft offline, then try hard offlining. + # Note: default offline choice is "soft". +-PAGE_CE_ACTION="soft" ++PAGE_CE_ACTION="off" + + # CPU Online Fault Isolation + # Whether to enable cpu online fault isolation (yes|no). +-- +2.43.5 + diff --git a/1022-anolis-disable-block-and-dev-error-default.patch b/1022-anolis-disable-block-and-dev-error-default.patch new file mode 100644 index 0000000000000000000000000000000000000000..83ba86cd54d1979765171b83de7e5a2664843225 --- /dev/null +++ b/1022-anolis-disable-block-and-dev-error-default.patch @@ -0,0 +1,26 @@ +From b5d1f625e8cee3697965e975483e523543d38b4b Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 12 Mar 2025 09:59:55 +0800 +Subject: [PATCH 22/30] anolis: disable block and dev error default + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 1833f1b..198b050 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive isolation from causing an avalanche effect + CPU_ISOLATION_LIMIT="10" + +-DISABLE="json_report,kmsg_monitor" ++DISABLE="json_report,kmsg_monitor,block:block_rq_complete,devlink:devlink_health_report" + + # Event Trigger + +-- +2.43.5 + diff --git 
a/1023-anolis-add-nvml-in-tree.patch b/1023-anolis-add-nvml-in-tree.patch new file mode 100644 index 0000000000000000000000000000000000000000..9d0718c05f30299dadfa9d37d93ee95acb325f77 --- /dev/null +++ b/1023-anolis-add-nvml-in-tree.patch @@ -0,0 +1,11441 @@ +From 46af414d74baab0e03d716e3af7e77ea3186c47e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 17 Apr 2025 17:17:55 +0800 +Subject: [PATCH 23/30] anolis: add nvml in tree + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/nvml.h | 11370 ++++++++++++++++++++++++++++++++++++++++++++++ + contrib/nvml.py | 13 +- + 3 files changed, 11381 insertions(+), 3 deletions(-) + create mode 100644 contrib/nvml.h + +diff --git a/Makefile.am b/Makefile.am +index 328fa49..4aba962 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -25,6 +25,7 @@ EXTRA_DIST = \ + misc/rasdaemon.env \ + misc/notices \ + contrib/nvml.py \ ++ contrib/nvml.h \ + contrib/*_trigger + + CLEANFILES= \ +diff --git a/contrib/nvml.h b/contrib/nvml.h +new file mode 100644 +index 0000000..937332e +--- /dev/null ++++ b/contrib/nvml.h +@@ -0,0 +1,11370 @@ ++/* ++ * Copyright 1993-2024 NVIDIA Corporation. All rights reserved. ++ * ++ * NOTICE TO USER: ++ * ++ * This source code is subject to NVIDIA ownership rights under U.S. and ++ * international Copyright laws. Users and possessors of this source code ++ * are hereby granted a nonexclusive, royalty-free license to use this code ++ * in individual and commercial software. ++ * ++ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE ++ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR ++ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH ++ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
++ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, ++ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS ++ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE ++ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE ++ * OR PERFORMANCE OF THIS SOURCE CODE. ++ * ++ * U.S. Government End Users. This source code is a "commercial item" as ++ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of ++ * "commercial computer software" and "commercial computer software ++ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) ++ * and is provided to the U.S. Government only as a commercial end item. ++ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through ++ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the ++ * source code with only those rights set forth herein. ++ * ++ * Any use of this source code in individual and commercial software must ++ * include, in the user documentation and internal comments to the code, ++ * the above Disclaimer and U.S. Government End Users Notice. ++ */ ++ ++/* ++NVML API Reference ++ ++The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and ++managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building ++3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi ++tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. 
++ ++API Documentation ++ ++Supported platforms: ++- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit ++- Linux: 32-bit and 64-bit ++- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 ++ ++Supported products: ++- Full Support ++ - All Tesla products, starting with the Fermi architecture ++ - All Quadro products, starting with the Fermi architecture ++ - All vGPU Software products, starting with the Kepler architecture ++ - Selected GeForce Titan products ++- Limited Support ++ - All Geforce products, starting with the Fermi architecture ++ ++The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is ++not be added to the system path by default. To dynamically link to NVML, add this path to the PATH ++environmental variable. To dynamically load NVML, call LoadLibrary with this path. ++ ++On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit ++and 64 bit NVML libraries will be installed. ++ ++Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html ++*/ ++ ++#ifndef __nvml_nvml_h__ ++#define __nvml_nvml_h__ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * On Windows, set up methods for DLL export ++ * define NVML_STATIC_IMPORT when using nvml_loader library ++ */ ++#if defined _WINDOWS ++ #if !defined NVML_STATIC_IMPORT ++ #if defined NVML_LIB_EXPORT ++ #define DECLDIR __declspec(dllexport) ++ #else ++ #define DECLDIR __declspec(dllimport) ++ #endif ++ #else ++ #define DECLDIR ++ #endif ++#else ++ #define DECLDIR ++#endif ++ ++ #define NVML_MCDM_SUPPORT ++ ++/** ++ * NVML API versioning support ++ */ ++#define NVML_API_VERSION 12 ++#define NVML_API_VERSION_STR "12" ++/** ++ * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. ++ * e.g. 
the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this ++ * guard if you need to support older versions of the API ++ */ ++#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS ++ #define nvmlInit nvmlInit_v2 ++ #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 ++ #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 ++ #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 ++ #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 ++ #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 ++ #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 ++ #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v4 ++ #define nvmlEventSetWait nvmlEventSetWait_v2 ++ #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 ++ #define nvmlComputeInstanceGetInfo nvmlComputeInstanceGetInfo_v2 ++ #define nvmlDeviceGetComputeRunningProcesses nvmlDeviceGetComputeRunningProcesses_v3 ++ #define nvmlDeviceGetGraphicsRunningProcesses nvmlDeviceGetGraphicsRunningProcesses_v3 ++ #define nvmlDeviceGetMPSComputeRunningProcesses nvmlDeviceGetMPSComputeRunningProcesses_v3 ++ #define nvmlBlacklistDeviceInfo_t nvmlExcludedDeviceInfo_t ++ #define nvmlGetBlacklistDeviceCount nvmlGetExcludedDeviceCount ++ #define nvmlGetBlacklistDeviceInfoByIndex nvmlGetExcludedDeviceInfoByIndex ++ #define nvmlDeviceGetGpuInstancePossiblePlacements nvmlDeviceGetGpuInstancePossiblePlacements_v2 ++ #define nvmlVgpuInstanceGetLicenseInfo nvmlVgpuInstanceGetLicenseInfo_v2 ++ #define nvmlDeviceGetDriverModel nvmlDeviceGetDriverModel_v2 ++#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS ++ ++#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \ ++ (ver << 24U)) ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceStructs Device Structs ++ * @{ ++ */ 
++/***************************************************************************************************/ ++ ++/** ++ * Special constant that some fields take when they are not available. ++ * Used when only part of the struct is not available. ++ * ++ * Each structure explicitly states when to check for this value. ++ */ ++#define NVML_VALUE_NOT_AVAILABLE (-1) ++ ++typedef struct nvmlDevice_st* nvmlDevice_t; ++ ++/** ++ * Buffer size guaranteed to be large enough for pci bus id ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 ++ ++/** ++ * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 ++ ++/** ++ * PCI information about a GPU device. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff ++ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff ++ unsigned int device; //!< The device's id on the bus, 0 to 31 ++ ++ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id ++ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID ++ ++ unsigned int baseClass; //!< The 8-bit PCI base class code ++ unsigned int subClass; //!< The 8-bit PCI sub class code ++ ++ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) ++} nvmlPciInfoExt_v1_t; ++typedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t; ++#define nvmlPciInfoExt_v1 NVML_STRUCT_VERSION(PciInfoExt, 1) ++ ++/** ++ * PCI information about a GPU device. 
++ */ ++typedef struct nvmlPciInfo_st ++{ ++ char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) ++ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff ++ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff ++ unsigned int device; //!< The device's id on the bus, 0 to 31 ++ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id ++ ++ // Added in NVML 2.285 API ++ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID ++ ++ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) ++} nvmlPciInfo_t; ++ ++/** ++ * PCI format string for ::busIdLegacy ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" ++ ++/** ++ * PCI format string for ::busId ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" ++ ++/** ++ * Utility macro for filling the pci bus id format from a nvmlPciInfo_t ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ ++ (pciInfo)->bus, \ ++ (pciInfo)->device ++ ++/** ++ * Detailed ECC error counts for a device. ++ * ++ * @deprecated Different GPU families can have different memory error counters ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ */ ++typedef struct nvmlEccErrorCounts_st ++{ ++ unsigned long long l1Cache; //!< L1 cache errors ++ unsigned long long l2Cache; //!< L2 cache errors ++ unsigned long long deviceMemory; //!< Device memory errors ++ unsigned long long registerFile; //!< Register file errors ++} nvmlEccErrorCounts_t; ++ ++/** ++ * Utilization information for a device. ++ * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
++ */ ++typedef struct nvmlUtilization_st ++{ ++ unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU ++ unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written ++} nvmlUtilization_t; ++ ++/** ++ * Memory allocation information for a device (v1). ++ * The total amount is equal to the sum of the amounts of free and used memory. ++ */ ++typedef struct nvmlMemory_st ++{ ++ unsigned long long total; //!< Total physical device memory (in bytes) ++ unsigned long long free; //!< Unallocated device memory (in bytes) ++ unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes). ++ //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping ++} nvmlMemory_t; ++ ++/** ++ * Memory allocation information for a device (v2). ++ * ++ * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output. ++ */ ++typedef struct nvmlMemory_v2_st ++{ ++ unsigned int version; //!< Structure format version (must be 2) ++ unsigned long long total; //!< Total physical device memory (in bytes) ++ unsigned long long reserved; //!< Device memory (in bytes) reserved for system use (driver or firmware) ++ unsigned long long free; //!< Unallocated device memory (in bytes) ++ unsigned long long used; //!< Allocated device memory (in bytes). ++} nvmlMemory_v2_t; ++ ++#define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2) ++ ++/** ++ * BAR1 Memory allocation Information for a device ++ */ ++typedef struct nvmlBAR1Memory_st ++{ ++ unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) ++ unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) ++ unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) ++}nvmlBAR1Memory_t; ++ ++/** ++ * Information about running compute processes on the GPU, legacy version ++ * for older versions of the API. 
++ */ ++typedef struct nvmlProcessInfo_v1_st ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++} nvmlProcessInfo_v1_t; ++ ++/** ++ * Information about running compute processes on the GPU ++ */ ++typedef struct nvmlProcessInfo_v2_st ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++ unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to ++ // 0xFFFFFFFF otherwise. ++ unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to ++ // 0xFFFFFFFF otherwise. ++} nvmlProcessInfo_v2_t, nvmlProcessInfo_t; ++ ++/** ++ * Information about running process on the GPU with protected memory ++ */ ++typedef struct ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++ unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is ++ // set to 0xFFFFFFFF otherwise. ++ unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId ++ // is set to 0xFFFFFFFF otherwise. ++ unsigned long long usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes. 
++} nvmlProcessDetail_v1_t; ++ ++/** ++ * Information about all running processes on the GPU for the given mode ++ */ ++typedef struct ++{ ++ unsigned int version; //!< Struct version, MUST be nvmlProcessDetailList_v1 ++ unsigned int mode; //!< Process mode(Compute/Graphics/MPSCompute) ++ unsigned int numProcArrayEntries; //!< Number of process entries in procArray ++ nvmlProcessDetail_v1_t *procArray; //!< Process array ++} nvmlProcessDetailList_v1_t; ++ ++typedef nvmlProcessDetailList_v1_t nvmlProcessDetailList_t; ++ ++/** ++ * nvmlProcessDetailList version ++ */ ++#define nvmlProcessDetailList_v1 NVML_STRUCT_VERSION(ProcessDetailList, 1) ++ ++typedef struct nvmlDeviceAttributes_st ++{ ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ unsigned int gpuInstanceSliceCount; //!< GPU instance slice count ++ unsigned int computeInstanceSliceCount; //!< Compute instance slice count ++ unsigned long long memorySizeMB; //!< Device memory size (in MiB) ++} nvmlDeviceAttributes_t; ++ ++/** ++ * C2C Mode information for a device ++ */ ++typedef struct ++{ ++ unsigned int isC2cEnabled; ++} nvmlC2cModeInfo_v1_t; ++ ++#define nvmlC2cModeInfo_v1 NVML_STRUCT_VERSION(C2cModeInfo, 1) ++ ++/** ++ * Possible values that classify the remap availability for each bank. The max ++ * field will contain the number of banks that have maximum remap availability ++ * (all reserved rows are available). None means that there are no reserved ++ * rows available. 
++ */ ++typedef struct nvmlRowRemapperHistogramValues_st ++{ ++ unsigned int max; ++ unsigned int high; ++ unsigned int partial; ++ unsigned int low; ++ unsigned int none; ++} nvmlRowRemapperHistogramValues_t; ++ ++/** ++ * Enum to represent type of bridge chip ++ */ ++typedef enum nvmlBridgeChipType_enum ++{ ++ NVML_BRIDGE_CHIP_PLX = 0, ++ NVML_BRIDGE_CHIP_BRO4 = 1 ++}nvmlBridgeChipType_t; ++ ++/** ++ * Maximum number of NvLink links supported ++ */ ++#define NVML_NVLINK_MAX_LINKS 18 ++ ++/** ++ * Enum to represent the NvLink utilization counter packet units ++ */ ++typedef enum nvmlNvLinkUtilizationCountUnits_enum ++{ ++ NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles ++ NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets ++ NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes ++ NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use ++ // this must be last ++ NVML_NVLINK_COUNTER_UNIT_COUNT ++} nvmlNvLinkUtilizationCountUnits_t; ++ ++/** ++ * Enum to represent the NvLink utilization counter packet types to count ++ * ** this is ONLY applicable with the units as packets or bytes ++ * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t ++ * ** all packet filter descriptions are target GPU centric ++ * ** these can be "OR'd" together ++ */ ++typedef enum nvmlNvLinkUtilizationCountPktTypes_enum ++{ ++ NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets ++ NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets ++ NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets ++ NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests ++ NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests ++ NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests ++ NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data ++ NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data ++ NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets ++} 
nvmlNvLinkUtilizationCountPktTypes_t; ++ ++/** ++ * Struct to define the NVLINK counter controls ++ */ ++typedef struct nvmlNvLinkUtilizationControl_st ++{ ++ nvmlNvLinkUtilizationCountUnits_t units; ++ nvmlNvLinkUtilizationCountPktTypes_t pktfilter; ++} nvmlNvLinkUtilizationControl_t; ++ ++/** ++ * Enum to represent NvLink queryable capabilities ++ */ ++typedef enum nvmlNvLinkCapability_enum ++{ ++ NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported ++ NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported ++ NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported ++ NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported ++ NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link ++ NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device ++ // should be last ++ NVML_NVLINK_CAP_COUNT ++} nvmlNvLinkCapability_t; ++ ++/** ++ * Enum to represent NvLink queryable error counters ++ */ ++typedef enum nvmlNvLinkErrorCounter_enum ++{ ++ NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter ++ NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter ++ NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter ++ NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter ++ NVML_NVLINK_ERROR_DL_ECC_DATA = 4, // Data link receive data ECC error counter ++ ++ // this must be last ++ NVML_NVLINK_ERROR_COUNT ++} nvmlNvLinkErrorCounter_t; ++ ++/** ++ * Enum to represent NvLink's remote device type ++ */ ++typedef enum nvmlIntNvLinkDeviceType_enum ++{ ++ NVML_NVLINK_DEVICE_TYPE_GPU = 0x00, ++ NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01, ++ NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02, ++ NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF ++} nvmlIntNvLinkDeviceType_t; ++ ++/** ++ * Represents level relationships within a system between two GPUs ++ * The enums are spaced to allow for future relationships ++ */ ++typedef enum 
nvmlGpuLevel_enum ++{ ++ NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80 ++ NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch ++ NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge ++ NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge ++ NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges ++ NVML_TOPOLOGY_SYSTEM = 50 // all devices in the system ++ ++ // there is purposefully no COUNT here because of the need for spacing above ++} nvmlGpuTopologyLevel_t; ++ ++/* Compatibility for CPU->NODE renaming */ ++#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE ++ ++/* P2P Capability Index Status*/ ++typedef enum nvmlGpuP2PStatus_enum ++{ ++ NVML_P2P_STATUS_OK = 0, ++ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, ++ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, ++ NVML_P2P_STATUS_GPU_NOT_SUPPORTED, ++ NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, ++ NVML_P2P_STATUS_DISABLED_BY_REGKEY, ++ NVML_P2P_STATUS_NOT_SUPPORTED, ++ NVML_P2P_STATUS_UNKNOWN ++ ++} nvmlGpuP2PStatus_t; ++ ++/* P2P Capability Index*/ ++typedef enum nvmlGpuP2PCapsIndex_enum ++{ ++ NVML_P2P_CAPS_INDEX_READ = 0, ++ NVML_P2P_CAPS_INDEX_WRITE = 1, ++ NVML_P2P_CAPS_INDEX_NVLINK = 2, ++ NVML_P2P_CAPS_INDEX_ATOMICS = 3, ++ NVML_P2P_CAPS_INDEX_PCI = 4, ++ /* ++ * DO NOT USE! NVML_P2P_CAPS_INDEX_PROP is deprecated. ++ * Use NVML_P2P_CAPS_INDEX_PCI instead. ++ */ ++ NVML_P2P_CAPS_INDEX_PROP = NVML_P2P_CAPS_INDEX_PCI, ++ NVML_P2P_CAPS_INDEX_UNKNOWN = 5, ++}nvmlGpuP2PCapsIndex_t; ++ ++/** ++ * Maximum limit on Physical Bridges per Board ++ */ ++#define NVML_MAX_PHYSICAL_BRIDGE (128) ++ ++/** ++ * Information about the Bridge Chip Firmware ++ */ ++typedef struct nvmlBridgeChipInfo_st ++{ ++ nvmlBridgeChipType_t type; //!< Type of Bridge Chip ++ unsigned int fwVersion; //!< Firmware Version. 
0=Version is unavailable ++}nvmlBridgeChipInfo_t; ++ ++/** ++ * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate ++ * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. ++ */ ++typedef struct nvmlBridgeChipHierarchy_st ++{ ++ unsigned char bridgeCount; //!< Number of Bridge Chips on the Board ++ nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board ++}nvmlBridgeChipHierarchy_t; ++ ++/** ++ * Represents Type of Sampling Event ++ */ ++typedef enum nvmlSamplingType_enum ++{ ++ NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU ++ NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU ++ NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written ++ NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy ++ NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy ++ NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples ++ NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples ++ NVML_MODULE_POWER_SAMPLES = 7, //!< To represent module power samples for total module starting Grace Hopper ++ NVML_JPG_UTILIZATION_SAMPLES = 8, //!< To represent percent of time during which NVJPG remains busy ++ NVML_OFA_UTILIZATION_SAMPLES = 9, //!< To represent percent of time during which NVOFA remains busy ++ ++ // Keep this last ++ NVML_SAMPLINGTYPE_COUNT ++}nvmlSamplingType_t; ++ ++/** ++ * Represents the queryable PCIe utilization counters ++ */ ++typedef enum nvmlPcieUtilCounter_enum ++{ ++ NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity ++ NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity ++ ++ // Keep this last ++ NVML_PCIE_UTIL_COUNT ++} 
nvmlPcieUtilCounter_t; ++ ++/** ++ * Represents the type for sample value returned ++ */ ++typedef enum nvmlValueType_enum ++{ ++ NVML_VALUE_TYPE_DOUBLE = 0, ++ NVML_VALUE_TYPE_UNSIGNED_INT = 1, ++ NVML_VALUE_TYPE_UNSIGNED_LONG = 2, ++ NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, ++ NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, ++ NVML_VALUE_TYPE_SIGNED_INT = 5, ++ NVML_VALUE_TYPE_UNSIGNED_SHORT = 6, ++ ++ // Keep this last ++ NVML_VALUE_TYPE_COUNT ++}nvmlValueType_t; ++ ++ ++/** ++ * Union to represent different types of Value ++ */ ++typedef union nvmlValue_st ++{ ++ double dVal; //!< If the value is double ++ int siVal; //!< If the value is signed int ++ unsigned int uiVal; //!< If the value is unsigned int ++ unsigned long ulVal; //!< If the value is unsigned long ++ unsigned long long ullVal; //!< If the value is unsigned long long ++ signed long long sllVal; //!< If the value is signed long long ++ unsigned short usVal; //!< If the value is unsigned short ++}nvmlValue_t; ++ ++/** ++ * Information for Sample ++ */ ++typedef struct nvmlSample_st ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlValue_t sampleValue; //!< Sample Value ++}nvmlSample_t; ++ ++/** ++ * Represents type of perf policy for which violation times can be queried ++ */ ++typedef enum nvmlPerfPolicyType_enum ++{ ++ NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks ++ NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks ++ NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks ++ NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks ++ NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks ++ NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be 
below application clocks ++ ++ NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) ++ NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks ++ ++ // Keep this last ++ NVML_PERF_POLICY_COUNT ++}nvmlPerfPolicyType_t; ++ ++/** ++ * Struct to hold perf policy violation status data ++ */ ++typedef struct nvmlViolationTime_st ++{ ++ unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds ++ unsigned long long violationTime; //!< violationTime in Nanoseconds ++}nvmlViolationTime_t; ++ ++#define NVML_MAX_THERMAL_SENSORS_PER_GPU 3 ++ ++/** ++ * Represents the thermal sensor targets ++ */ ++typedef enum ++{ ++ NVML_THERMAL_TARGET_NONE = 0, ++ NVML_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle ++ NVML_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle ++ NVML_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle ++ ++ NVML_THERMAL_TARGET_ALL = 15, ++ NVML_THERMAL_TARGET_UNKNOWN = -1, ++} nvmlThermalTarget_t; ++ ++/** ++ * Represents the thermal sensor controllers ++ */ ++typedef enum ++{ ++ NVML_THERMAL_CONTROLLER_NONE = 0, ++ NVML_THERMAL_CONTROLLER_GPU_INTERNAL, ++ NVML_THERMAL_CONTROLLER_ADM1032, ++ NVML_THERMAL_CONTROLLER_ADT7461, ++ NVML_THERMAL_CONTROLLER_MAX6649, ++ NVML_THERMAL_CONTROLLER_MAX1617, ++ NVML_THERMAL_CONTROLLER_LM99, ++ 
NVML_THERMAL_CONTROLLER_LM89, ++ NVML_THERMAL_CONTROLLER_LM64, ++ NVML_THERMAL_CONTROLLER_G781, ++ NVML_THERMAL_CONTROLLER_ADT7473, ++ NVML_THERMAL_CONTROLLER_SBMAX6649, ++ NVML_THERMAL_CONTROLLER_VBIOSEVT, ++ NVML_THERMAL_CONTROLLER_OS, ++ NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS, ++ NVML_THERMAL_CONTROLLER_NVSYSCON_E551, ++ NVML_THERMAL_CONTROLLER_MAX6649R, ++ NVML_THERMAL_CONTROLLER_ADT7473S, ++ NVML_THERMAL_CONTROLLER_UNKNOWN = -1, ++} nvmlThermalController_t; ++ ++/** ++ * Struct to hold the thermal sensor settings ++ */ ++typedef struct ++{ ++ unsigned int count; ++ struct ++ { ++ nvmlThermalController_t controller; ++ int defaultMinTemp; ++ int defaultMaxTemp; ++ int currentTemp; ++ nvmlThermalTarget_t target; ++ } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; ++ ++} nvmlGpuThermalSettings_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceEnums Device Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Generic enable/disable enum. ++ */ ++typedef enum nvmlEnableState_enum ++{ ++ NVML_FEATURE_DISABLED = 0, //!< Feature disabled ++ NVML_FEATURE_ENABLED = 1 //!< Feature enabled ++} nvmlEnableState_t; ++ ++//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. ++#define nvmlFlagDefault 0x00 ++//! Generic flag used to force some behavior. See description of particular functions for details. ++#define nvmlFlagForce 0x01 ++ ++/** ++ * * The Brand of the GPU ++ * */ ++typedef enum nvmlBrandType_enum ++{ ++ NVML_BRAND_UNKNOWN = 0, ++ NVML_BRAND_QUADRO = 1, ++ NVML_BRAND_TESLA = 2, ++ NVML_BRAND_NVS = 3, ++ NVML_BRAND_GRID = 4, // Deprecated from API reporting. Keeping definition for backward compatibility. 
++ NVML_BRAND_GEFORCE = 5, ++ NVML_BRAND_TITAN = 6, ++ NVML_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications ++ NVML_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC ++ NVML_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server ++ NVML_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation ++ NVML_BRAND_NVIDIA_CLOUD_GAMING = 11, // NVIDIA Cloud Gaming ++ NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING, // Deprecated from API reporting. Keeping definition for backward compatibility. ++ NVML_BRAND_QUADRO_RTX = 12, ++ NVML_BRAND_NVIDIA_RTX = 13, ++ NVML_BRAND_NVIDIA = 14, ++ NVML_BRAND_GEFORCE_RTX = 15, // Unused ++ NVML_BRAND_TITAN_RTX = 16, // Unused ++ ++ // Keep this last ++ NVML_BRAND_COUNT ++} nvmlBrandType_t; ++ ++/** ++ * Temperature thresholds. ++ */ ++typedef enum nvmlTemperatureThresholds_enum ++{ ++ NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will ++ // shut down for HW protection ++ NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will ++ // begin HW slowdown ++ NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will ++ // begin SW slowdown ++ NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU ++ // can be throttled below base clock ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4, // Minimum GPU Temperature that can be ++ // set as acoustic threshold ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, // Current temperature that is set as ++ // acoustic threshold. ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6, // Maximum GPU temperature that can be ++ // set as acoustic threshold. ++ NVML_TEMPERATURE_THRESHOLD_GPS_CURR = 7, // Current temperature that is set as ++ // gps threshold. ++ // Keep this last ++ NVML_TEMPERATURE_THRESHOLD_COUNT ++} nvmlTemperatureThresholds_t; ++ ++/** ++ * Temperature sensors. 
++ */ ++typedef enum nvmlTemperatureSensors_enum ++{ ++ NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die ++ ++ // Keep this last ++ NVML_TEMPERATURE_COUNT ++} nvmlTemperatureSensors_t; ++ ++/** ++ * Compute mode. ++ * ++ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. ++ * Earlier CUDA versions supported a single exclusive mode, ++ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. ++ */ ++typedef enum nvmlComputeMode_enum ++{ ++ NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device ++ NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed ++ NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device ++ NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time ++ ++ // Keep this last ++ NVML_COMPUTEMODE_COUNT ++} nvmlComputeMode_t; ++ ++/** ++ * Max Clock Monitors available ++ */ ++#define MAX_CLK_DOMAINS 32 ++ ++/** ++ * Clock Monitor error types ++ */ ++typedef struct nvmlClkMonFaultInfo_struct { ++ /** ++ * The Domain which faulted ++ */ ++ unsigned int clkApiDomain; ++ ++ /** ++ * Faults Information ++ */ ++ unsigned int clkDomainFaultMask; ++} nvmlClkMonFaultInfo_t; ++ ++/** ++ * Clock Monitor Status ++ */ ++typedef struct nvmlClkMonStatus_status { ++ /** ++ * Fault status Indicator ++ */ ++ unsigned int bGlobalStatus; ++ ++ /** ++ * Total faulted domain numbers ++ */ ++ unsigned int clkMonListSize; ++ ++ /** ++ * The fault Information structure ++ */ ++ nvmlClkMonFaultInfo_t clkMonList[MAX_CLK_DOMAINS]; ++} nvmlClkMonStatus_t; ++ ++/** ++ * ECC bit types. 
++ * ++ * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type ++ */ ++#define nvmlEccBitType_t nvmlMemoryErrorType_t ++ ++/** ++ * Single bit ECC errors ++ * ++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED ++ */ ++#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED ++ ++/** ++ * Double bit ECC errors ++ * ++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED ++ */ ++#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED ++ ++/** ++ * Memory error types ++ */ ++typedef enum nvmlMemoryErrorType_enum ++{ ++ /** ++ * A memory error that was corrected ++ * ++ * For ECC errors, these are single bit errors ++ * For Texture memory, these are errors fixed by resend ++ */ ++ NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, ++ /** ++ * A memory error that was not corrected ++ * ++ * For ECC errors, these are double bit errors ++ * For Texture memory, these are errors where the resend fails ++ */ ++ NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, ++ ++ ++ // Keep this last ++ NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types ++ ++} nvmlMemoryErrorType_t; ++ ++/** ++ * ECC counter types. ++ * ++ * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. ++ * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver ++ * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app ++ * is run. ++ */ ++typedef enum nvmlEccCounterType_enum ++{ ++ NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. ++ NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) ++ ++ // Keep this last ++ NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types ++} nvmlEccCounterType_t; ++ ++/** ++ * Clock types. ++ * ++ * All speeds are in Mhz. 
++ */ ++typedef enum nvmlClockType_enum ++{ ++ NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain ++ NVML_CLOCK_SM = 1, //!< SM clock domain ++ NVML_CLOCK_MEM = 2, //!< Memory clock domain ++ NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain ++ ++ // Keep this last ++ NVML_CLOCK_COUNT //!< Count of clock types ++} nvmlClockType_t; ++ ++/** ++ * Clock Ids. These are used in combination with nvmlClockType_t ++ * to specify a single clock value. ++ */ ++typedef enum nvmlClockId_enum ++{ ++ NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value ++ NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock ++ NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target ++ NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate ++ ++ //Keep this last ++ NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. ++} nvmlClockId_t; ++ ++/** ++ * Driver models. ++ * ++ * Windows only. ++ */ ++ ++typedef enum nvmlDriverModel_enum ++{ ++ NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device ++ NVML_DRIVER_WDM = 1, //!< WDM (TCC) model (deprecated) -- GPU treated as a generic compute device ++ NVML_DRIVER_MCDM = 2 //!< MCDM driver model -- GPU treated as a Microsoft compute device ++} nvmlDriverModel_t; ++ ++#define NVML_MAX_GPU_PERF_PSTATES 16 ++ ++/** ++ * Allowed PStates. 
++ */ ++typedef enum nvmlPStates_enum ++{ ++ NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance ++ NVML_PSTATE_1 = 1, //!< Performance state 1 ++ NVML_PSTATE_2 = 2, //!< Performance state 2 ++ NVML_PSTATE_3 = 3, //!< Performance state 3 ++ NVML_PSTATE_4 = 4, //!< Performance state 4 ++ NVML_PSTATE_5 = 5, //!< Performance state 5 ++ NVML_PSTATE_6 = 6, //!< Performance state 6 ++ NVML_PSTATE_7 = 7, //!< Performance state 7 ++ NVML_PSTATE_8 = 8, //!< Performance state 8 ++ NVML_PSTATE_9 = 9, //!< Performance state 9 ++ NVML_PSTATE_10 = 10, //!< Performance state 10 ++ NVML_PSTATE_11 = 11, //!< Performance state 11 ++ NVML_PSTATE_12 = 12, //!< Performance state 12 ++ NVML_PSTATE_13 = 13, //!< Performance state 13 ++ NVML_PSTATE_14 = 14, //!< Performance state 14 ++ NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance ++ NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state ++} nvmlPstates_t; ++ ++/** ++ * Clock offset info. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ nvmlClockType_t type; ++ nvmlPstates_t pstate; ++ int clockOffsetMHz; ++ int minClockOffsetMHz; ++ int maxClockOffsetMHz; ++} nvmlClockOffset_v1_t; ++ ++typedef nvmlClockOffset_v1_t nvmlClockOffset_t; ++ ++#define nvmlClockOffset_v1 NVML_STRUCT_VERSION(ClockOffset, 1) ++ ++/** ++ * GPU Operation Mode ++ * ++ * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. ++ * ++ * Each GOM is designed to meet specific user needs. ++ */ ++typedef enum nvmlGom_enum ++{ ++ NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed ++ ++ NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations ++ //!< are not allowed ++ ++ NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require ++ //!< high bandwidth double precision ++} nvmlGpuOperationMode_t; ++ ++/** ++ * Available infoROM objects. 
++ */ ++typedef enum nvmlInforomObject_enum ++{ ++ NVML_INFOROM_OEM = 0, //!< An object defined by OEM ++ NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support ++ NVML_INFOROM_POWER = 2, //!< The power management object ++ ++ // Keep this last ++ NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about ++} nvmlInforomObject_t; ++ ++/** ++ * Return values for NVML API calls. ++ */ ++typedef enum nvmlReturn_enum ++{ ++ // cppcheck-suppress * ++ NVML_SUCCESS = 0, //!< The operation was successful ++ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() ++ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid ++ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device ++ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation ++ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting ++ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful ++ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough ++ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached ++ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded ++ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed ++ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU ++ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded ++ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function ++ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted ++ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible ++ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again ++ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The 
GPU control device has been blocked by the operating system/cgroups ++ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch ++ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use ++ NVML_ERROR_MEMORY = 20, //!< Insufficient memory ++ NVML_ERROR_NO_DATA = 21, //!< No data ++ NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, because ECC is enabled ++ NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory ++ NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Presumably: the requested frequency is not supported (previous text duplicated the INSUFFICIENT_RESOURCES description -- confirm against NVML docs) ++ NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported ++ NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated ++ NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request ++ NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found ++ NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation ++ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred ++} nvmlReturn_t; ++ ++/** ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ */ ++typedef enum nvmlMemoryLocation_enum ++{ ++ NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache ++ NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache ++ NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM ++ NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory ++ NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File ++ NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory ++ NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory ++ NVML_MEMORY_LOCATION_CBU = 6, //!< CBU ++ NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM ++ // Keep this last ++ NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about ++} nvmlMemoryLocation_t; ++ ++/** ++ * Causes for page retirement ++ */
++typedef enum nvmlPageRetirementCause_enum ++{ ++ NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error ++ NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error ++ ++ // Keep this last ++ NVML_PAGE_RETIREMENT_CAUSE_COUNT ++} nvmlPageRetirementCause_t; ++ ++/** ++ * API types that allow changes to default permission restrictions ++ */ ++typedef enum nvmlRestrictedAPI_enum ++{ ++ NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks ++ //!< and see nvmlDeviceResetApplicationsClocks ++ NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks ++ //!< see nvmlDeviceSetAutoBoostedClocksEnabled ++ // Keep this last ++ NVML_RESTRICTED_API_COUNT ++} nvmlRestrictedAPI_t; ++ ++/** ++ * Structure to store utilization value and process Id ++ */ ++typedef struct nvmlProcessUtilizationSample_st ++{ ++ unsigned int pid; //!< PID of process ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++} nvmlProcessUtilizationSample_t; ++ ++/** ++ * Structure to store utilization value and process Id -- version 1 ++ */ ++typedef struct ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int pid; //!< PID of process ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++ unsigned int jpgUtil; //!< Jpeg Util Value ++ unsigned int ofaUtil; //!< Ofa Util Value ++} nvmlProcessUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store utilization and 
process ID for each running process -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int processSamplesCount; //!< Caller-supplied array size, and returns number of processes running ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlProcessUtilizationInfo_v1_t *procUtilArray; //!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA ++} nvmlProcessesUtilizationInfo_v1_t; ++typedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t; ++#define nvmlProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(ProcessesUtilizationInfo, 1) ++ ++/** ++ * Structure to store SRAM uncorrectable error counters ++ */ ++typedef struct ++{ ++ unsigned int version; //!< the API version number ++ unsigned long long aggregateUncParity; //!< aggregate uncorrectable parity error count ++ unsigned long long aggregateUncSecDed; //!< aggregate uncorrectable SEC-DED error count ++ unsigned long long aggregateCor; //!< aggregate correctable error count ++ unsigned long long volatileUncParity; //!< volatile uncorrectable parity error count ++ unsigned long long volatileUncSecDed; //!< volatile uncorrectable SEC-DED error count ++ unsigned long long volatileCor; //!< volatile correctable error count ++ unsigned long long aggregateUncBucketL2; //!< aggregate uncorrectable error count for L2 cache bucket ++ unsigned long long aggregateUncBucketSm; //!< aggregate uncorrectable error count for SM bucket ++ unsigned long long aggregateUncBucketPcie; //!< aggregate uncorrectable error count for PCIE bucket ++ unsigned long long aggregateUncBucketMcu; //!< aggregate uncorrectable error count for Microcontroller bucket ++ unsigned long long aggregateUncBucketOther; //!< aggregate uncorrectable error count for Other bucket ++ unsigned int bThresholdExceeded; //!< if the error threshold of field diag 
is exceeded ++} nvmlEccSramErrorStatus_v1_t; ++ ++typedef nvmlEccSramErrorStatus_v1_t nvmlEccSramErrorStatus_t; ++#define nvmlEccSramErrorStatus_v1 NVML_STRUCT_VERSION(EccSramErrorStatus, 1) ++ ++/** ++ * GSP firmware ++ */ ++#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40 ++ ++/** ++ * Simplified chip architecture ++ */ ++#define NVML_DEVICE_ARCH_KEPLER 2 // Devices based on the NVIDIA Kepler architecture ++#define NVML_DEVICE_ARCH_MAXWELL 3 // Devices based on the NVIDIA Maxwell architecture ++#define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture ++#define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture ++#define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture ++#define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture ++#define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture ++#define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture ++ ++#define NVML_DEVICE_ARCH_BLACKWELL 10 // Devices based on the NVIDIA Blackwell architecture ++ ++#define NVML_DEVICE_ARCH_T23X 11 // Devices based on NVIDIA Orin architecture ++ ++#define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer ++ ++typedef unsigned int nvmlDeviceArchitecture_t; ++ ++/** ++ * PCI bus types ++ */ ++#define NVML_BUS_TYPE_UNKNOWN 0 ++#define NVML_BUS_TYPE_PCI 1 ++#define NVML_BUS_TYPE_PCIE 2 ++#define NVML_BUS_TYPE_FPCI 3 ++#define NVML_BUS_TYPE_AGP 4 ++ ++typedef unsigned int nvmlBusType_t; ++ ++/** ++ * Device Power Modes ++ */ ++ ++/** ++ * Device Fan control policy ++ */ ++#define NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW 0 ++#define NVML_FAN_POLICY_MANUAL 1 ++ ++typedef unsigned int nvmlFanControlPolicy_t; ++ ++/** ++ * Device Power Source ++ */ ++#define NVML_POWER_SOURCE_AC 0x00000000 ++#define NVML_POWER_SOURCE_BATTERY 0x00000001 ++#define NVML_POWER_SOURCE_UNDERSIZED 0x00000002 ++ ++typedef unsigned int 
nvmlPowerSource_t; ++ ++/* ++ * Device PCIE link Max Speed ++ */ ++#define NVML_PCIE_LINK_MAX_SPEED_INVALID 0x00000000 ++#define NVML_PCIE_LINK_MAX_SPEED_2500MBPS 0x00000001 ++#define NVML_PCIE_LINK_MAX_SPEED_5000MBPS 0x00000002 ++#define NVML_PCIE_LINK_MAX_SPEED_8000MBPS 0x00000003 ++#define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004 ++#define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005 ++#define NVML_PCIE_LINK_MAX_SPEED_64000MBPS 0x00000006 ++ ++/* ++ * Adaptive clocking status ++ */ ++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000 ++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED 0x00000001 ++ ++#define NVML_MAX_GPU_UTILIZATIONS 8 ++ ++/** ++ * Represents the GPU utilization domains ++ */ ++typedef enum nvmlGpuUtilizationDomainId_t ++{ ++ NVML_GPU_UTILIZATION_DOMAIN_GPU = 0, //!< Graphics engine domain ++ NVML_GPU_UTILIZATION_DOMAIN_FB = 1, //!< Frame buffer domain ++ NVML_GPU_UTILIZATION_DOMAIN_VID = 2, //!< Video engine domain ++ NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain ++} nvmlGpuUtilizationDomainId_t; ++ ++typedef struct nvmlGpuDynamicPstatesInfo_st ++{ ++ unsigned int flags; //!< Reserved for future use ++ struct ++ { ++ unsigned int bIsPresent; //!< Set if this utilization domain is present on this GPU ++ unsigned int percentage; //!< Percentage of time where the domain is considered busy in the last 1-second interval ++ unsigned int incThreshold; //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed ++ unsigned int decThreshold; //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed ++ } utilization[NVML_MAX_GPU_UTILIZATIONS]; ++} nvmlGpuDynamicPstatesInfo_t; ++ ++/* ++ * PCIe outbound/inbound atomic operations capability ++ */ ++#define NVML_PCIE_ATOMICS_CAP_FETCHADD32 0x01 ++#define NVML_PCIE_ATOMICS_CAP_FETCHADD64 0x02 ++#define NVML_PCIE_ATOMICS_CAP_SWAP32 0x04 ++#define NVML_PCIE_ATOMICS_CAP_SWAP64 0x08 ++#define 
NVML_PCIE_ATOMICS_CAP_CAS32 0x10 ++#define NVML_PCIE_ATOMICS_CAP_CAS64 0x20 ++#define NVML_PCIE_ATOMICS_CAP_CAS128 0x40 ++#define NVML_PCIE_ATOMICS_OPS_MAX 7 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @addtogroup virtualGPU vGPU Enums, Constants, Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** @defgroup nvmlVirtualGpuEnums vGPU Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/*! ++ * GPU virtualization mode types. ++ */ ++typedef enum nvmlGpuVirtualizationMode { ++ NVML_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU ++ NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough ++ NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. ++ NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode ++ NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 //!< Device is associated with VGX hypervisor in vSGA mode ++} nvmlGpuVirtualizationMode_t; ++ ++/** ++ * Host vGPU modes ++ */ ++typedef enum nvmlHostVgpuMode_enum ++{ ++ NVML_HOST_VGPU_MODE_NON_SRIOV = 0, //!< Non SR-IOV mode ++ NVML_HOST_VGPU_MODE_SRIOV = 1 //!< SR-IOV mode ++} nvmlHostVgpuMode_t; ++ ++/*!
++ * Types of VM identifiers ++ */ ++typedef enum nvmlVgpuVmIdType { ++ NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID ++ NVML_VGPU_VM_ID_UUID = 1 //!< VM ID represents UUID ++} nvmlVgpuVmIdType_t; ++ ++/** ++ * vGPU GUEST info state ++ */ ++typedef enum nvmlVgpuGuestInfoState_enum ++{ ++ NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized ++ NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 //!< Guest-dependent fields initialized ++} nvmlVgpuGuestInfoState_t; ++ ++/** ++ * vGPU software licensable features ++ */ ++typedef enum { ++ NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0, //!< Unknown ++ NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU ++ NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2, //!< Nvidia RTX ++ NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, //!< Deprecated, do not use. ++ NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3, //!< Gaming ++ NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 //!< Compute ++} nvmlGridLicenseFeatureCode_t; ++ ++/** ++ * Status codes for license expiry ++ */ ++#define NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE 0 //!< Expiry information not available ++#define NVML_GRID_LICENSE_EXPIRY_INVALID 1 //!< Invalid expiry or error fetching expiry ++#define NVML_GRID_LICENSE_EXPIRY_VALID 2 //!< Valid expiry ++#define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE 3 //!< Expiry not applicable ++#define NVML_GRID_LICENSE_EXPIRY_PERMANENT 4 //!< Permanent expiry ++ ++/** ++ * vGPU queryable capabilities ++ */ ++typedef enum nvmlVgpuCapability_enum ++{ ++ NVML_VGPU_CAP_NVLINK_P2P = 0, //!< P2P over NVLink is supported ++ NVML_VGPU_CAP_GPUDIRECT = 1, //!< GPUDirect capability is supported ++ NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2, //!< vGPU profile cannot be mixed with other vGPU profiles in same VM ++ NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3, //!< vGPU profile cannot run on a GPU alongside other profiles of different type ++ NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4, //!< 
vGPU profile cannot run on a GPU alongside other profiles of different size ++ // Keep this last ++ NVML_VGPU_CAP_COUNT ++} nvmlVgpuCapability_t; ++ ++/** ++* vGPU driver queryable capabilities ++*/ ++typedef enum nvmlVgpuDriverCapability_enum ++{ ++ NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0, //!< Supports mixing of different vGPU profiles within one guest VM ++ NVML_VGPU_DRIVER_CAP_WARM_UPDATE = 1, //!< Supports FSR and warm update of vGPU host driver without terminating the running guest VM ++ // Keep this last ++ NVML_VGPU_DRIVER_CAP_COUNT ++} nvmlVgpuDriverCapability_t; ++ ++/** ++* Device vGPU queryable capabilities ++*/ ++typedef enum nvmlDeviceVgpuCapability_enum ++{ ++ NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0, //!< Query if the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations ++ NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1, //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing types ++ NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2, //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes ++ NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3, //!< Query the GPU's read_device_buffer expected bandwidth capacity in megabytes per second ++ NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4, //!< Query the GPU's write_device_buffer expected bandwidth capacity in megabytes per second ++ NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING = 5, //!< Query if vGPU profiles on the GPU support migration data streaming ++ NVML_DEVICE_VGPU_CAP_MINI_QUARTER_GPU = 6, //!< Set/Get support for mini-quarter vGPU profiles ++ NVML_DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU = 7, //!< Set/Get support for compute media engine vGPU profiles ++ NVML_DEVICE_VGPU_CAP_WARM_UPDATE = 8, //!< Query if the GPU supports FSR and warm update ++ // Keep this last ++ NVML_DEVICE_VGPU_CAP_COUNT ++} nvmlDeviceVgpuCapability_t; ++ ++/** @} */ ++
++/***************************************************************************************************/ ++ ++/** @defgroup nvmlVgpuConstants vGPU Constants ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense ++ */ ++#define NVML_GRID_LICENSE_BUFFER_SIZE 128 ++ ++#define NVML_VGPU_NAME_BUFFER_SIZE 64 ++ ++#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 ++ ++#define INVALID_GPU_INSTANCE_PROFILE_ID 0xFFFFFFFF ++ ++#define INVALID_GPU_INSTANCE_ID 0xFFFFFFFF ++ ++#define NVML_INVALID_VGPU_PLACEMENT_ID 0xFFFF ++ ++/*! ++ * Macros for vGPU instance's virtualization capabilities bitfield. ++ */ ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 ++ ++/*! ++ * Macros for pGPU's virtualization capabilities bitfield. ++ */ ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVgpuStructs vGPU Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++typedef unsigned int nvmlVgpuTypeId_t; ++ ++typedef unsigned int nvmlVgpuInstance_t; ++ ++/** ++ * Structure to store the vGPU heterogeneous mode of device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int mode; //!< The vGPU heterogeneous mode ++} nvmlVgpuHeterogeneousMode_v1_t; ++typedef nvmlVgpuHeterogeneousMode_v1_t nvmlVgpuHeterogeneousMode_t; ++#define nvmlVgpuHeterogeneousMode_v1 NVML_STRUCT_VERSION(VgpuHeterogeneousMode, 1) ++ ++/** ++ * Structure to store 
the placement ID of vGPU instance -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int placementId; //!< Placement ID of the active vGPU instance ++} nvmlVgpuPlacementId_v1_t; ++typedef nvmlVgpuPlacementId_v1_t nvmlVgpuPlacementId_t; ++#define nvmlVgpuPlacementId_v1 NVML_STRUCT_VERSION(VgpuPlacementId, 1) ++ ++/** ++ * Structure to store the list of vGPU placements -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int placementSize; //!< The number of slots occupied by the vGPU type ++ unsigned int count; //!< Count of placement IDs fetched ++ unsigned int *placementIds; //!< Placement IDs for the vGPU type ++} nvmlVgpuPlacementList_v1_t; ++typedef nvmlVgpuPlacementList_v1_t nvmlVgpuPlacementList_t; ++#define nvmlVgpuPlacementList_v1 NVML_STRUCT_VERSION(VgpuPlacementList, 1) ++ ++/** ++ * Structure to store BAR1 size information of vGPU type -- Version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned long long bar1Size; //!< BAR1 size in megabytes ++} nvmlVgpuTypeBar1Info_v1_t; ++typedef nvmlVgpuTypeBar1Info_v1_t nvmlVgpuTypeBar1Info_t; ++#define nvmlVgpuTypeBar1Info_v1 NVML_STRUCT_VERSION(VgpuTypeBar1Info, 1) ++ ++/** ++ * Structure to store Utilization Value and vgpuInstance ++ */ ++typedef struct nvmlVgpuInstanceUtilizationSample_st ++{ ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value ++ nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value ++ nvmlValue_t encUtil; //!< Encoder Util Value ++ nvmlValue_t decUtil; //!< Decoder Util Value ++} nvmlVgpuInstanceUtilizationSample_t; ++ ++/** ++ * Structure to store Utilization Value and vgpuInstance Info -- Version 1 ++ */ ++typedef struct ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in 
microseconds ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value ++ nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value ++ nvmlValue_t encUtil; //!< Encoder Util Value ++ nvmlValue_t decUtil; //!< Decoder Util Value ++ nvmlValue_t jpgUtil; //!< Jpeg Util Value ++ nvmlValue_t ofaUtil; //!< Ofa Util Value ++} nvmlVgpuInstanceUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store recent utilization for vGPU instances running on a device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ nvmlValueType_t sampleValType; //!< Hold the type of returned sample values ++ unsigned int vgpuInstanceCount; //!< Hold the number of vGPU instances ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlVgpuInstanceUtilizationInfo_v1_t *vgpuUtilArray; //!< The array (allocated by caller) in which vGPU utilization are returned ++} nvmlVgpuInstancesUtilizationInfo_v1_t; ++typedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t; ++#define nvmlVgpuInstancesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuInstancesUtilizationInfo, 1) ++ ++/** ++ * Structure to store Utilization Value, vgpuInstance and subprocess information ++ */ ++typedef struct nvmlVgpuProcessUtilizationSample_st ++{ ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned int pid; //!< PID of process running within the vGPU VM ++ char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++} nvmlVgpuProcessUtilizationSample_t; ++ ++/** ++ * Structure to store Utilization Value, vgpuInstance and 
subprocess information for process running on vGPU instance -- version 1 ++ */ ++typedef struct ++{ ++ char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned int pid; //!< PID of process running within the vGPU VM ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++ unsigned int jpgUtil; //!< Jpeg Util Value ++ unsigned int ofaUtil; //!< Ofa Util Value ++} nvmlVgpuProcessUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store recent utilization, vgpuInstance and subprocess information for processes running on vGPU instances active on a device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int vgpuProcessCount; //!< Hold the number of processes running on vGPU instances ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlVgpuProcessUtilizationInfo_v1_t *vgpuProcUtilArray; //!< The array (allocated by caller) in which utilization of processes running on vGPU instances are returned ++} nvmlVgpuProcessesUtilizationInfo_v1_t; ++typedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t; ++#define nvmlVgpuProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuProcessesUtilizationInfo, 1) ++ ++/** ++ * vGPU scheduler policies ++ */ ++#define NVML_VGPU_SCHEDULER_POLICY_UNKNOWN 0 ++#define NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT 1 ++#define NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE 2 ++#define NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE 3 ++ ++#define NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT 3 ++ ++#define NVML_SCHEDULER_SW_MAX_LOG_ENTRIES 200 ++ ++#define NVML_VGPU_SCHEDULER_ARR_DEFAULT 0 
++#define NVML_VGPU_SCHEDULER_ARR_DISABLE 1 ++#define NVML_VGPU_SCHEDULER_ARR_ENABLE 2 ++ ++/** ++ * Union to represent the vGPU Scheduler Parameters ++ */ ++typedef union ++{ ++ struct ++ { ++ unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode ++ unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise ++ } vgpuSchedDataWithARR; ++ ++ struct ++ { ++ unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise ++ } vgpuSchedData; ++ ++} nvmlVgpuSchedulerParams_t; ++ ++/** ++ * Structure to store the state and logs of a software runlist ++ */ ++typedef struct nvmlVgpuSchedulerLogEntries_st ++{ ++ unsigned long long timestamp; //!< Timestamp in ns when this software runlist was preempted ++ unsigned long long timeRunTotal; //!< Total time in ns this software runlist has run ++ unsigned long long timeRun; //!< Time in ns this software runlist ran before preemption ++ unsigned int swRunlistId; //!< Software runlist Id ++ unsigned long long targetTimeSlice; //!< The actual timeslice after deduction ++ unsigned long long cumulativePreemptionTime; //!< Preemption time in ns for this SW runlist ++} nvmlVgpuSchedulerLogEntry_t; ++ ++/** ++ * Structure to store a vGPU software scheduler log ++ */ ++typedef struct nvmlVgpuSchedulerLog_st ++{ ++ unsigned int engineId; //!< Engine whose software runlist log entries are fetched ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*.
++ nvmlVgpuSchedulerParams_t schedulerParams; ++ unsigned int entriesCount; //!< Count of log entries fetched ++ nvmlVgpuSchedulerLogEntry_t logEntries[NVML_SCHEDULER_SW_MAX_LOG_ENTRIES]; ++} nvmlVgpuSchedulerLog_t; ++ ++/** ++ * Structure to store the vGPU scheduler state ++ */ ++typedef struct nvmlVgpuSchedulerGetState_st ++{ ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*. ++ nvmlVgpuSchedulerParams_t schedulerParams; ++} nvmlVgpuSchedulerGetState_t; ++ ++/** ++ * Union to represent the vGPU Scheduler set Parameters ++ */ ++typedef union ++{ ++ struct ++ { ++ unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode ++ unsigned int frequency; //!< Frequency for Adaptive Round Robin mode ++ } vgpuSchedDataWithARR; ++ ++ struct ++ { ++ unsigned int timeslice; //!< The timeslice in ns(Nanoseconds) for each software run list as configured, or the default value otherwise ++ } vgpuSchedData; ++ ++} nvmlVgpuSchedulerSetParams_t; ++ ++/** ++ * Structure to set the vGPU scheduler state ++ */ ++typedef struct nvmlVgpuSchedulerSetState_st ++{ ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int enableARRMode; //!< Adaptive Round Robin scheduler ++ nvmlVgpuSchedulerSetParams_t schedulerParams; ++} nvmlVgpuSchedulerSetState_t; ++ ++/** ++ * Structure to store the vGPU scheduler capabilities ++ */ ++typedef struct nvmlVgpuSchedulerCapabilities_st ++{ ++ unsigned int supportedSchedulers[NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT]; //!< List the supported vGPU schedulers on the device ++ unsigned int maxTimeslice; //!< Maximum timeslice value in ns ++ unsigned int minTimeslice; //!< Minimum timeslice value in ns ++ unsigned int isArrModeSupported; //!< Flag to check Adaptive Round Robin mode enabled/disabled. 
++ unsigned int maxFrequencyForARR; //!< Maximum frequency for Adaptive Round Robin mode ++ unsigned int minFrequencyForARR; //!< Minimum frequency for Adaptive Round Robin mode ++ unsigned int maxAvgFactorForARR; //!< Maximum averaging factor for Adaptive Round Robin mode ++ unsigned int minAvgFactorForARR; //!< Minimum averaging factor for Adaptive Round Robin mode ++} nvmlVgpuSchedulerCapabilities_t; ++ ++/** ++ * Structure to store the vGPU license expiry details ++ */ ++typedef struct nvmlVgpuLicenseExpiry_st ++{ ++ unsigned int year; //!< Year of license expiry ++ unsigned short month; //!< Month of license expiry ++ unsigned short day; //!< Day of license expiry ++ unsigned short hour; //!< Hour of license expiry ++ unsigned short min; //!< Minutes of license expiry ++ unsigned short sec; //!< Seconds of license expiry ++ unsigned char status; //!< License expiry status ++} nvmlVgpuLicenseExpiry_t; ++ ++/** ++ * vGPU license state ++ */ ++#define NVML_GRID_LICENSE_STATE_UNKNOWN 0 //!< Unknown state ++#define NVML_GRID_LICENSE_STATE_UNINITIALIZED 1 //!< Uninitialized state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED 2 //!< Unlicensed unrestricted state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED 3 //!< Unlicensed restricted state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED 4 //!< Unlicensed state ++#define NVML_GRID_LICENSE_STATE_LICENSED 5 //!< Licensed state ++ ++typedef struct nvmlVgpuLicenseInfo_st ++{ ++ unsigned char isLicensed; //!< License status ++ nvmlVgpuLicenseExpiry_t licenseExpiry; //!< License expiry information ++ unsigned int currentState; //!< Current license state ++} nvmlVgpuLicenseInfo_t; ++ ++/** ++ * Structure to store license expiry date and time values ++ */ ++typedef struct nvmlGridLicenseExpiry_st ++{ ++ unsigned int year; //!< Year value of license expiry ++ unsigned short month; //!< Month value of license expiry ++ unsigned short day; //!< Day value of license expiry ++ unsigned short hour; //!< Hour 
value of license expiry ++ unsigned short min; //!< Minutes value of license expiry ++ unsigned short sec; //!< Seconds value of license expiry ++ unsigned char status; //!< License expiry status ++} nvmlGridLicenseExpiry_t; ++ ++/** ++ * Structure containing vGPU software licensable feature information ++ */ ++typedef struct nvmlGridLicensableFeature_st ++{ ++ nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code ++ unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero ++ char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Deprecated. ++ char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Product name of feature ++ unsigned int featureEnabled; //!< Non-zero if feature is enabled, otherwise zero ++ nvmlGridLicenseExpiry_t licenseExpiry; //!< License expiry structure containing date and time ++} nvmlGridLicensableFeature_t; ++ ++/** ++ * Structure to store vGPU software licensable features ++ */ ++typedef struct nvmlGridLicensableFeatures_st ++{ ++ int isGridLicenseSupported; //!< Non-zero if vGPU Software Licensing is supported on the system, otherwise zero ++ unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array ++ nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. ++} nvmlGridLicensableFeatures_t; ++ ++/** @} */ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFieldValueEnums Field Value Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Field Identifiers. ++ * ++ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. ++ */ ++#define NVML_FI_DEV_ECC_CURRENT 1 //!< Current ECC mode. 1=Active. 0=Inactive ++#define NVML_FI_DEV_ECC_PENDING 2 //!< Pending ECC mode. 
1=Active. 0=Inactive ++/* ECC Count Totals */ ++#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL 3 //!< Total single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL 4 //!< Total double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL 5 //!< Total single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL 6 //!< Total double bit aggregate (persistent) ECC errors ++/* Individual ECC locations */ ++#define NVML_FI_DEV_ECC_SBE_VOL_L1 7 //!< L1 cache single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_L1 8 //!< L1 cache double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_L2 9 //!< L2 cache single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_L2 10 //!< L2 cache double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_DEV 11 //!< Device memory single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_DEV 12 //!< Device memory double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_REG 13 //!< Register file single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_REG 14 //!< Register file double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_TEX 15 //!< Texture memory single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_TEX 16 //!< Texture memory double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_CBU 17 //!< CBU double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_L1 18 //!< L1 cache single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_L1 19 //!< L1 cache double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_L2 20 //!< L2 cache single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_L2 21 //!< L2 cache double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_DEV 22 //!< Device memory single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_DEV 23 //!< Device memory double bit aggregate 
(persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_REG 24 //!< Register File single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_REG 25 //!< Register File double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_TEX 26 //!< Texture memory single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_TEX 27 //!< Texture memory double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_CBU 28 //!< CBU double bit aggregate ECC errors ++ ++/* Page Retirement */ ++#define NVML_FI_DEV_RETIRED_SBE 29 //!< Number of retired pages because of single bit errors ++#define NVML_FI_DEV_RETIRED_DBE 30 //!< Number of retired pages because of double bit errors ++#define NVML_FI_DEV_RETIRED_PENDING 31 //!< If any pages are pending retirement. 1=yes. 0=no. ++ ++/* NvLink Flit Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 32 //!< NVLink flow control CRC Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 33 //!< NVLink flow control CRC Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 34 //!< NVLink flow control CRC Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 35 //!< NVLink flow control CRC Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 36 //!< NVLink flow control CRC Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 37 //!< NVLink flow control CRC Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes ++ ++/* NvLink CRC Data Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 39 //!< NVLink data CRC Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 40 //!< NVLink data CRC Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 41 //!< NVLink data CRC Error Counter 
for Lane 2 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 42 //!< NVLink data CRC Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 43 //!< NVLink data CRC Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 44 //!< NVLink data CRC Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NvLink data CRC Error Counter total for all Lanes ++ ++/* NvLink Replay Error Counters */ ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 46 //!< NVLink Replay Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 47 //!< NVLink Replay Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 48 //!< NVLink Replay Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 49 //!< NVLink Replay Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 50 //!< NVLink Replay Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 51 //!< NVLink Replay Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 52 //!< NVLink Replay Error Counter total for all Lanes ++ ++/* NvLink Recovery Error Counters */ ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 53 //!< NVLink Recovery Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 54 //!< NVLink Recovery Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 55 //!< NVLink Recovery Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 56 //!< NVLink Recovery Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 57 //!< NVLink Recovery Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 58 //!< NVLink Recovery Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes ++ ++/* NvLink Bandwidth Counters */ ++/* ++ * 
NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. ++ * Please use the following field values instead: ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX ++ */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL 66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes ++ ++/* NvLink Bandwidth Counters */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL 73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes ++ ++/* NVML Perf Policy Counters */ ++#define NVML_FI_DEV_PERF_POLICY_POWER 74 //!< Perf Policy Counter for Power Policy ++#define NVML_FI_DEV_PERF_POLICY_THERMAL 75 //!< Perf Policy 
Counter for Thermal Policy ++#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST 76 //!< Perf Policy Counter for Sync boost Policy ++#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT 77 //!< Perf Policy Counter for Board Limit ++#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION 78 //!< Perf Policy Counter for Low GPU Utilization Policy ++#define NVML_FI_DEV_PERF_POLICY_RELIABILITY 79 //!< Perf Policy Counter for Reliability Policy ++#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS 80 //!< Perf Policy Counter for Total App Clock Policy ++#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS 81 //!< Perf Policy Counter for Total Base Clocks Policy ++ ++/* Memory temperatures */ ++#define NVML_FI_DEV_MEMORY_TEMP 82 //!< Memory temperature for the device ++ ++/* Energy Counter */ ++#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded ++ ++/* NVLink Speed */ ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 84 //!< NVLink Speed in MBps for Link 0 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 85 //!< NVLink Speed in MBps for Link 1 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 86 //!< NVLink Speed in MBps for Link 2 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 87 //!< NVLink Speed in MBps for Link 3 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 88 //!< NVLink Speed in MBps for Link 4 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 89 //!< NVLink Speed in MBps for Link 5 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links ++ ++#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device ++ ++#define NVML_FI_DEV_RETIRED_PENDING_SBE 92 //!< If any pages are pending retirement due to SBE. 1=yes. 0=no. ++#define NVML_FI_DEV_RETIRED_PENDING_DBE 93 //!< If any pages are pending retirement due to DBE. 1=yes. 0=no. 
++ ++#define NVML_FI_DEV_PCIE_REPLAY_COUNTER 94 //!< PCIe replay counter ++#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER 95 //!< PCIe replay rollover counter ++ ++/* NvLink Flit Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 96 //!< NVLink flow control CRC Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 97 //!< NVLink flow control CRC Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 98 //!< NVLink flow control CRC Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 99 //!< NVLink flow control CRC Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 100 //!< NVLink flow control CRC Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 101 //!< NVLink flow control CRC Error Counter for Lane 11 ++ ++/* NvLink CRC Data Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 102 //!< NVLink data CRC Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 103 //!< NVLink data CRC Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 104 //!< NVLink data CRC Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 105 //!< NVLink data CRC Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 106 //!< NVLink data CRC Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 107 //!< NVLink data CRC Error Counter for Lane 11 ++ ++/* NvLink Replay Error Counters */ ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 108 //!< NVLink Replay Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 109 //!< NVLink Replay Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 110 //!< NVLink Replay Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 111 //!< NVLink Replay Error Counter for Lane 9 ++#define 
NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 112 //!< NVLink Replay Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 113 //!< NVLink Replay Error Counter for Lane 11 ++ ++/* NvLink Recovery Error Counters */ ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 114 //!< NVLink Recovery Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 115 //!< NVLink Recovery Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 116 //!< NVLink Recovery Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 117 //!< NVLink Recovery Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 118 //!< NVLink Recovery Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 119 //!< NVLink Recovery Error Counter for Lane 11 ++ ++/* NvLink Bandwidth Counters */ ++/* ++ * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. ++ * Please use the following field values instead: ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX ++ */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11 ++ ++/* NvLink Bandwidth Counters */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6 ++#define 
NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11 ++ ++/* NVLink Speed */ ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 132 //!< NVLink Speed in MBps for Link 6 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 133 //!< NVLink Speed in MBps for Link 7 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 134 //!< NVLink Speed in MBps for Link 8 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 135 //!< NVLink Speed in MBps for Link 9 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 136 //!< NVLink Speed in MBps for Link 10 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 137 //!< NVLink Speed in MBps for Link 11 ++ ++/** ++ * NVLink throughput counters field values ++ * ++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ * A scopeId of UINT_MAX returns aggregate value summed up across all links ++ * for the specified counter type in fieldId. ++ */ ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX 138 //!< NVLink TX Data throughput in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX 139 //!< NVLink RX Data throughput in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 //!< NVLink TX Data + protocol overhead in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 //!< NVLink RX Data + protocol overhead in KiB ++ ++/* Row Remapper */ ++#define NVML_FI_DEV_REMAPPED_COR 142 //!< Number of remapped rows due to correctable errors ++#define NVML_FI_DEV_REMAPPED_UNC 143 //!< Number of remapped rows due to uncorrectable errors ++#define NVML_FI_DEV_REMAPPED_PENDING 144 //!< If any rows are pending remapping. 
1=yes 0=no ++#define NVML_FI_DEV_REMAPPED_FAILURE 145 //!< If any rows failed to be remapped 1=yes 0=no ++ ++/** ++ * Remote device NVLink ID ++ * ++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ */ ++#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID ++ ++/** ++ * NVSwitch: connected NVLink count ++ */ ++#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch ++ ++/* NvLink ECC Data Error Counters ++ * ++ * Lane ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ * ++ */ ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 148 //!< NVLink data ECC Error Counter for Link 0 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 149 //!< NVLink data ECC Error Counter for Link 1 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 150 //!< NVLink data ECC Error Counter for Link 2 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 151 //!< NVLink data ECC Error Counter for Link 3 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 152 //!< NVLink data ECC Error Counter for Link 4 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 153 //!< NVLink data ECC Error Counter for Link 5 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 154 //!< NVLink data ECC Error Counter for Link 6 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 155 //!< NVLink data ECC Error Counter for Link 7 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 156 //!< NVLink data ECC Error Counter for Link 8 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 157 //!< NVLink data ECC Error Counter for Link 9 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 158 //!< NVLink data ECC Error Counter for Link 10 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NVLink data ECC Error Counter total for all Links ++ ++#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 
161 //!< NVLink Replay Error Counter ++#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 //!< NVLink Recovery Error Counter ++#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 //!< NVLink CRC Error Counter ++#define NVML_FI_DEV_NVLINK_GET_SPEED 164 //!< NVLink Speed in MBps ++#define NVML_FI_DEV_NVLINK_GET_STATE 165 //!< NVLink State - Active,Inactive ++#define NVML_FI_DEV_NVLINK_GET_VERSION 166 //!< NVLink Version ++ ++#define NVML_FI_DEV_NVLINK_GET_POWER_STATE 167 //!< NVLink Power state. 0=HIGH_SPEED 1=LOW_SPEED ++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD 168 //!< NVLink length of idle period (units can be found from ++ // NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS) before ++ // transitioning links to sleep state ++ ++#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER 169 //!< Device PEX error recovery counter ++ ++#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device ++#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE ++#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links ++ ++#define NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS 173 //!< PCIe Correctable Errors Counter ++#define NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED 174 //!< PCIe NAK Receive Counter ++#define NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR 175 //!< PCIe Receiver Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_BAD_TLP 176 //!< PCIe Bad TLP Counter ++#define NVML_FI_DEV_PCIE_COUNT_NAKS_SENT 177 //!< PCIe NAK Send Counter ++#define NVML_FI_DEV_PCIE_COUNT_BAD_DLLP 178 //!< PCIe Bad DLLP Counter ++#define NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR 179 //!< PCIe Non Fatal Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR 180 //!< PCIe Fatal Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ 181 //!< PCIe Unsupported Request Counter ++#define NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR 182 //!< PCIe LCRC Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_LANE_ERROR 183 //!< PCIe Per Lane Error Counter. 
++ ++#define NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED 184 //!< Device's Resetless MIG Capability ++ ++/** ++ * Retrieves power usage for this GPU in milliwatts. ++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and ++ * \ref nvmlDeviceGetPowerUsage. ++ * ++ * scopeId needs to be specified. It signifies: ++ * 0 - GPU Only Scope - Metrics for GPU are retrieved ++ * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved. ++ * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported ++ */ ++#define NVML_FI_DEV_POWER_AVERAGE 185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures. ++#define NVML_FI_DEV_POWER_INSTANT 186 //!< Current GPU power, supported on all architectures. ++#define NVML_FI_DEV_POWER_MIN_LIMIT 187 //!< Minimum power limit in milliwatts. ++#define NVML_FI_DEV_POWER_MAX_LIMIT 188 //!< Maximum power limit in milliwatts. ++#define NVML_FI_DEV_POWER_DEFAULT_LIMIT 189 //!< Default power limit in milliwatts (limit which device boots with). ++#define NVML_FI_DEV_POWER_CURRENT_LIMIT 190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band). ++#define NVML_FI_DEV_ENERGY 191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU. ++#define NVML_FI_DEV_POWER_REQUESTED_LIMIT 192 //!< Power limit requested by NVML or any other userspace client. ++ ++/** ++ * GPU T.Limit temperature thresholds in degrees Celsius ++ * ++ * These fields are supported on Ada and later architectures and supersede \ref nvmlDeviceGetTemperatureThreshold.
++ */ ++#define NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT 193 //!< T.Limit temperature after which GPU may shut down for HW protection ++#define NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT 194 //!< T.Limit temperature after which GPU may begin HW slowdown ++#define NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT 195 //!< T.Limit temperature after which GPU may begin SW slowdown due to memory temperature ++#define NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT 196 //!< T.Limit temperature after which GPU may be throttled below base clock ++ ++#define NVML_FI_DEV_PCIE_COUNT_TX_BYTES 197 //!< PCIe transmit bytes. Value can be wrapped. ++#define NVML_FI_DEV_PCIE_COUNT_RX_BYTES 198 //!< PCIe receive bytes. Value can be wrapped. ++ ++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX 199 //!< Max Nvlink Power Threshold. See NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD ++ ++#define NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE 200 //!< MIG mode independent, MIG query capable device. 1=yes. 0=no. ++ ++#define NVML_FI_DEV_NVLINK_COUNT_XMIT_PACKETS 201 //!usedGpuMemory is not supported ++ ++ ++ unsigned long long time; //!< Amount of time in ms during which the compute context was active. 
The time is reported as 0 if ++ //!< the process is not terminated ++ ++ unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process ++ ++ unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) ++ ++ unsigned int reserved[5]; //!< Reserved for future use ++} nvmlAccountingStats_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlEncoderStructs Encoder Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Represents type of encoder for capacity can be queried ++ */ ++typedef enum nvmlEncoderQueryType_enum ++{ ++ NVML_ENCODER_QUERY_H264 = 0x00, //!< H264 encoder ++ NVML_ENCODER_QUERY_HEVC = 0x01, //!< HEVC encoder ++ NVML_ENCODER_QUERY_AV1 = 0x02, //!< AV1 encoder ++ NVML_ENCODER_QUERY_UNKNOWN = 0xFF //!< Unknown encoder ++}nvmlEncoderType_t; ++ ++/** ++ * Structure to hold encoder session data ++ */ ++typedef struct nvmlEncoderSessionInfo_st ++{ ++ unsigned int sessionId; //!< Unique session ID ++ unsigned int pid; //!< Owning process ID ++ nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) ++ nvmlEncoderType_t codecType; //!< Video encoder type ++ unsigned int hResolution; //!< Current encode horizontal resolution ++ unsigned int vResolution; //!< Current encode vertical resolution ++ unsigned int averageFps; //!< Moving average encode frames per second ++ unsigned int averageLatency; //!< Moving average encode latency in microseconds ++}nvmlEncoderSessionInfo_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures ++* @{ ++*/ 
++/***************************************************************************************************/ ++ ++/** ++ * Represents frame buffer capture session type ++ */ ++typedef enum nvmlFBCSessionType_enum ++{ ++ NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown ++ NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys ++ NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda ++ NVML_FBC_SESSION_TYPE_VID, //!< Vid ++ NVML_FBC_SESSION_TYPE_HWENC //!< HWEnc ++} nvmlFBCSessionType_t; ++ ++/** ++ * Structure to hold frame buffer capture session stats ++ */ ++typedef struct nvmlFBCStats_st ++{ ++ unsigned int sessionsCount; //!< Total number of sessions ++ unsigned int averageFPS; //!< Moving average new frames captured per second ++ unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds ++} nvmlFBCStats_t; ++ ++#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. ++#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. ++ ++/** ++ * Structure to hold FBC session data ++ */ ++typedef struct nvmlFBCSessionInfo_st ++{ ++ unsigned int sessionId; //!< Unique session ID ++ unsigned int pid; //!< Owning process ID ++ nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) ++ unsigned int displayOrdinal; //!< Display identifier ++ nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session ++ unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX).
++ unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session ++ unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session ++ unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call ++ unsigned int vResolution; //!< Vertical resolution requested by caller in capture call ++ unsigned int averageFPS; //!< Moving average new frames captured per second ++ unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds ++} nvmlFBCSessionInfo_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDrainDefs Drain State definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() ++ */ ++typedef enum nvmlDetachGpuState_enum ++{ ++ NVML_DETACH_GPU_KEEP = 0, ++ NVML_DETACH_GPU_REMOVE ++} nvmlDetachGpuState_t; ++ ++/** ++ * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() ++ */ ++typedef enum nvmlPcieLinkState_enum ++{ ++ NVML_PCIE_LINK_KEEP = 0, ++ NVML_PCIE_LINK_SHUT_DOWN ++} nvmlPcieLinkState_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlConfidentialComputingDefs Confidential Computing definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** ++ * Confidential Compute CPU Capabilities values ++ */ ++#define NVML_CC_SYSTEM_CPU_CAPS_NONE 0 ++#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV 1 ++#define NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX 2 ++ ++/** ++ * Confidential Compute GPU Capabilities values ++ */ ++#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0 ++#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE 1 ++ ++typedef struct
nvmlConfComputeSystemCaps_st { ++ unsigned int cpuCaps; ++ unsigned int gpusCaps; ++} nvmlConfComputeSystemCaps_t; ++ ++/** ++ * Confidential Compute DevTools Mode values ++ */ ++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF 0 ++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_ON 1 ++ ++/** ++ * Confidential Compute Environment values ++ */ ++#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0 ++#define NVML_CC_SYSTEM_ENVIRONMENT_SIM 1 ++#define NVML_CC_SYSTEM_ENVIRONMENT_PROD 2 ++ ++/** ++ * Confidential Compute Feature Status values ++ */ ++#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 ++#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 ++ ++typedef struct nvmlConfComputeSystemState_st { ++ unsigned int environment; ++ unsigned int ccFeature; ++ unsigned int devToolsMode; ++} nvmlConfComputeSystemState_t; ++ ++/** ++ * Confidential Compute Multigpu mode values ++ */ ++#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 ++#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 ++ ++/** ++ * Confidential Compute System settings ++ */ ++typedef struct { ++ unsigned int version; ++ unsigned int environment; ++ unsigned int ccFeature; ++ unsigned int devToolsMode; ++ unsigned int multiGpuMode; ++} nvmlSystemConfComputeSettings_v1_t; ++ ++typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; ++#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) ++ ++/** ++ * Protected memory size ++ */ ++typedef struct ++nvmlConfComputeMemSizeInfo_st ++{ ++ unsigned long long protectedMemSizeKib; ++ unsigned long long unprotectedMemSizeKib; ++} nvmlConfComputeMemSizeInfo_t; ++ ++/** ++ * Confidential Compute GPUs/System Ready State values ++ */ ++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0 ++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE 1 ++ ++/** ++ * GPU Certificate Details ++ */ ++#define NVML_GPU_CERT_CHAIN_SIZE 0x1000 ++#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1400 ++ ++typedef struct nvmlConfComputeGpuCertificate_st { ++ unsigned int certChainSize; ++ 
unsigned int attestationCertChainSize; ++ unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE]; ++ unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE]; ++} nvmlConfComputeGpuCertificate_t; ++ ++/** ++ * GPU Attestation Report ++ */ ++#define NVML_CC_GPU_CEC_NONCE_SIZE 0x20 ++#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000 ++#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000 ++#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0 ++#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1 ++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN 50 ++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX 75 ++ ++typedef struct nvmlConfComputeGpuAttestationReport_st { ++ unsigned int isCecAttestationReportPresent; ++ unsigned int attestationReportSize; ++ unsigned int cecAttestationReportSize; ++ unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE]; ++ unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE]; ++ unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE]; ++} nvmlConfComputeGpuAttestationReport_t; ++ ++typedef struct nvmlConfComputeSetKeyRotationThresholdInfo_st { ++ unsigned int version; ++ unsigned long long maxAttackerAdvantage; ++} nvmlConfComputeSetKeyRotationThresholdInfo_v1_t; ++ ++typedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t nvmlConfComputeSetKeyRotationThresholdInfo_t; ++#define nvmlConfComputeSetKeyRotationThresholdInfo_v1 \ ++ NVML_STRUCT_VERSION(ConfComputeSetKeyRotationThresholdInfo, 1) ++ ++typedef struct nvmlConfComputeGetKeyRotationThresholdInfo_st { ++ unsigned int version; ++ unsigned long long attackerAdvantage; ++} nvmlConfComputeGetKeyRotationThresholdInfo_v1_t; ++ ++typedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRotationThresholdInfo_t; ++#define nvmlConfComputeGetKeyRotationThresholdInfo_v1 \ ++ NVML_STRUCT_VERSION(ConfComputeGetKeyRotationThresholdInfo, 1) ++ ++/** @} */ ++ 
++/***************************************************************************************************/ ++/** @defgroup nvmlFabricDefs Fabric definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++#define NVML_GPU_FABRIC_UUID_LEN 16 ++ ++#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 ++#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 ++#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 ++#define NVML_GPU_FABRIC_STATE_COMPLETED 3 ++ ++typedef unsigned char nvmlGpuFabricState_t; ++ ++/** ++ * Contains the device fabric information ++ */ ++typedef struct { ++ unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs ++ nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". ++ unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs ++ nvmlGpuFabricState_t state; //!< Current state of GPU registration process ++} nvmlGpuFabricInfo_t; ++ ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 ++ ++#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 ++#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x3 //!< Two-bit field mask (binary 11); must cover the values 0..2 above ++ ++/** ++ * GPU Fabric Health Status Mask for various fields can be obtained ++ * using the below macro. ++ * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW) ++ */ ++#define NVML_GPU_FABRIC_HEALTH_GET(var, type) \ ++ (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \ ++ (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type)) ++ ++/** ++ * GPU Fabric Health Status Mask for various fields can be tested ++ * using the below macro.
++ * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE) ++ */ ++#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \ ++ (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \ ++ NVML_GPU_FABRIC_HEALTH_MASK##type##val) ++ ++/** ++* GPU Fabric information (v2). ++* ++* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field ++* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask ++* field to the end. This structure is not backwards-compatible with ++* \ref nvmlGpuFabricInfo_t. ++*/ ++typedef struct { ++ unsigned int version; //!< Structure version identifier (set to \p nvmlGpuFabricInfo_v2) ++ unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs ++ nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". ++ unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs ++ nvmlGpuFabricState_t state; //!< Current state of GPU registration process ++ unsigned int healthMask; //!< GPU Fabric health Status Mask ++} nvmlGpuFabricInfo_v2_t; ++ ++typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; ++ ++/** ++* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. ++*/ ++#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) ++ ++/** ++ * Device Scope - This is useful to retrieve the telemetry at GPU and module (e.g. 
GPU + CPU) level ++ */ ++#define NVML_POWER_SCOPE_GPU 0U //!< Targets only GPU ++#define NVML_POWER_SCOPE_MODULE 1U //!< Targets the whole module ++#define NVML_POWER_SCOPE_MEMORY 2U //!< Targets the GPU Memory ++ ++typedef unsigned char nvmlPowerScopeType_t; ++ ++/** ++ * Contains the power management limit ++ */ ++typedef struct ++{ ++ unsigned int version; //!< Structure format version (must be 1) ++ nvmlPowerScopeType_t powerScope; //!< [in] Device type: GPU or Total Module ++ unsigned int powerValueMw; //!< [out] Power value to retrieve or set in milliwatts ++} nvmlPowerValue_v2_t; ++ ++#define nvmlPowerValue_v2 NVML_STRUCT_VERSION(PowerValue, 2) ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup ++ * This chapter describes the methods that handle NVML initialization and cleanup. ++ * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and ++ * nvmlShutdown() once NVML is no longer being used. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found ++#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs ++ ++/** ++ * Initialize NVML, but don't initialize any GPUs yet. ++ * ++ * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values ++ * modifying the behaviour of nvmlInit(). ++ * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that ++ * did initialize all GPU devices in the system. ++ * ++ * This allows NVML to communicate with a GPU ++ * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are ++ * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. 
++ * ++ * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in ++ * a bad or unstable state. ++ * ++ * For all products. ++ * ++ * This method, should be called once before invoking any other methods in the library. ++ * A reference count of the number of initializations is maintained. Shutdown only occurs ++ * when the reference count reaches zero. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly initialized ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running ++ * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlInit_v2(void); ++ ++/** ++ * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values ++ * modifying the behaviour of nvmlInit(). ++ * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. ++ * ++ * For all products. ++ * ++ * @param flags behaviour modifier flags ++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly initialized ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running ++ * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); ++ ++/** ++ * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). ++ * ++ * For all products. ++ * ++ * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() ++ * A reference count of the number of initializations is maintained. Shutdown only occurs ++ * when the reference count reaches zero. For backwards compatibility, no error is reported if ++ * nvmlShutdown() is called more times than nvmlInit(). 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly shut down ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlShutdown(void); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlErrorReporting Error reporting ++ * This chapter describes helper functions for error reporting routines. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Helper method for converting NVML error codes into readable strings. ++ * ++ * For all products. ++ * ++ * @param result NVML error code to convert ++ * ++ * @return String representation of the error. ++ * ++ */ ++const DECLDIR char* nvmlErrorString(nvmlReturn_t result); ++/** @} */ ++ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlConstants Constants ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion ++ */ ++#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 ++ ++/** ++ * Buffer size guaranteed to be large enough for storing GPU identifiers. 
++ */ ++#define NVML_DEVICE_UUID_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID ++ */ ++#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber ++ */ ++#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion ++ */ ++#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion ++ */ ++#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for storing GPU device names. ++ */ ++#define NVML_DEVICE_NAME_BUFFER_SIZE 64 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName ++ */ ++#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial ++ */ ++#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion ++ */ ++#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlSystemQueries System Queries ++ * This chapter describes the queries that NVML can perform against the local system. These queries ++ * are not device-specific. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves the version of the system's graphics driver. ++ * ++ * For all products. ++ * ++ * The version identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
++ * ++ * @param version Reference in which to return the version identifier ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); ++ ++/** ++ * Retrieves the version of the NVML library. ++ * ++ * For all products. ++ * ++ * The version identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. ++ * ++ * @param version Reference in which to return the version identifier ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); ++ ++/** ++ * Retrieves the version of the CUDA driver. ++ * ++ * For all products. ++ * ++ * The CUDA driver version returned will be retrieved from the currently installed version of CUDA. ++ * If the cuda library is not found, this function will return a known supported version number. ++ * ++ * @param cudaDriverVersion Reference in which to return the version identifier ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); ++ ++/** ++ * Retrieves the version of the CUDA driver from the shared library.
++ * ++ * For all products. ++ * ++ * The CUDA driver version is obtained by calling cuDriverGetVersion() ++ * ++ * @param cudaDriverVersion Reference in which to return the version identifier ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL ++ * - \ref NVML_ERROR_LIBRARY_NOT_FOUND if \a libcuda.so.1 or libcuda.dll is not found ++ * - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); ++ ++/** ++ * Macros for converting the CUDA driver version number to Major and Minor version numbers. ++ */ ++#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000) ++#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10) ++ ++/** ++ * Gets name of the process with provided process id ++ * ++ * For all products. ++ * ++ * Returned process name is cropped to provided length. ++ * name string is encoded in ANSI. ++ * ++ * @param pid The identifier of the process ++ * @param name Reference in which to return the process name ++ * @param length The maximum allowed length of the string returned in \a name ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a name has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if the process doesn't exist ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); ++ ++/** ++ * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. ++ * ++ * For S-class products.
++ * ++ * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. ++ * The HIC must be connected to an S-class system for it to be reported by this function. ++ * ++ * @param hwbcCount Size of hwbcEntries array ++ * @param hwbcEntries Array holding information about hwbc ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); ++ ++/** ++ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number ++ * For all products. ++ * Supported on Linux only. ++ * ++ * @param cpuNumber The CPU number ++ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray ++ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count ++ * number of device handles. 
++ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); ++ ++/** ++ * Structure to store Driver branch information ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ char branch[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< driver branch ++} nvmlSystemDriverBranchInfo_v1_t; ++typedef nvmlSystemDriverBranchInfo_v1_t nvmlSystemDriverBranchInfo_t; ++#define nvmlSystemDriverBranchInfo_v1 NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1) ++ ++/** ++ * Retrieves the driver branch of the NVIDIA driver installed on the system. ++ * ++ * For all products. ++ * ++ * The branch identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
++ * ++ * @param branchInfo Pointer to the driver branch information structure \a nvmlSystemDriverBranchInfo_t ++ * @param length The maximum allowed length of the driver branch string ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a branchInfo is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetDriverBranch(nvmlSystemDriverBranchInfo_t *branchInfo, unsigned int length); ++ ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUnitQueries Unit Queries ++ * This chapter describes the queries that NVML can perform against each unit. For S-class systems only. ++ * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by ++ * calling \ref nvmlUnitGetHandleByIndex(). ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++ /** ++ * Retrieves the number of units in the system. ++ * ++ * For S-class products. ++ * ++ * @param unitCount Reference in which to return the number of units ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a unitCount has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); ++ ++/** ++ * Acquire the handle for a particular unit, based on its index. ++ * ++ * For S-class products. ++ * ++ * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount().
++ * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. ++ * ++ * The order in which NVML enumerates units has no guarantees of consistency between reboots. ++ * ++ * @param index The index of the target unit, >= 0 and < \a unitCount ++ * @param unit Reference in which to return the unit handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a unit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); ++ ++/** ++ * Retrieves the static information associated with a unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlUnitInfo_t for details on available unit info. ++ * ++ * @param unit The identifier of the target unit ++ * @param info Reference in which to return the unit information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a info has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); ++ ++/** ++ * Retrieves the LED state associated with this unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlLedState_t for details on allowed states. 
++ * ++ * @param unit The identifier of the target unit ++ * @param state Reference in which to return the current LED state ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a state has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlUnitSetLedState() ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); ++ ++/** ++ * Retrieves the PSU stats for the unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlPSUInfo_t for details on available PSU info. ++ * ++ * @param unit The identifier of the target unit ++ * @param psu Reference in which to return the PSU information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a psu has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); ++ ++/** ++ * Retrieves the temperature readings for the unit, in degrees C. ++ * ++ * For S-class products. ++ * ++ * Depending on the product, readings may be available for intake (type=0), ++ * exhaust (type=1) and board (type=2). 
++ * ++ * @param unit The identifier of the target unit ++ * @param type The type of reading to take ++ * @param temp Reference in which to return the intake temperature ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); ++ ++/** ++ * Retrieves the fan speed readings for the unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. ++ * ++ * @param unit The identifier of the target unit ++ * @param fanSpeeds Reference in which to return the fan speed information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a fanSpeeds has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); ++ ++/** ++ * Retrieves the set of GPU devices that are attached to the specified unit. ++ * ++ * For S-class products. ++ * ++ * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
++ * ++ * @param unit The identifier of the target unit ++ * @param deviceCount Reference in which to provide the \a devices array size, and ++ * to return the number of attached GPU devices ++ * @param devices Reference in which to return the references to the attached GPU devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceQueries Device Queries ++ * This chapter describes the queries that NVML can perform against each device. ++ * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by ++ * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), ++ * \ref nvmlDeviceGetHandleByPciBusId_v2(), or \ref nvmlDeviceGetHandleByUUID(). ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++ /** ++ * Retrieves the number of compute devices in the system. A compute device is a single GPU. ++ * ++ * For all products. ++ * ++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system ++ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. ++ * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
++ * For backward binary compatibility reasons _v1 version of the API is still present in the shared ++ * library. ++ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. ++ * ++ * @param deviceCount Reference in which to return the number of accessible devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); ++ ++/** ++ * Get attributes (engine counts etc.) for the given NVML device handle. ++ * ++ * @note This API currently only supports MIG device handles. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device NVML device handle ++ * @param attributes Device attributes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); ++ ++/** ++ * Acquire the handle for a particular device, based on its index. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a accessibleDevices count returned by ++ * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it ++ * is recommended that devices be looked up by their PCI ids or UUID. See ++ * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). ++ * ++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs if: ++ * - The target GPU is an SLI slave ++ * ++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system ++ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. ++ * Update your code to handle this error, or use NVML 4.304 or older nvml header file. ++ * For backward binary compatibility reasons _v1 version of the API is still present in the shared ++ * library. ++ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. ++ * ++ * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. ++ * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't ++ * need to worry about that. 
++ * ++ * @param index The index of the target GPU, >= 0 and < \a accessibleDevices ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetIndex ++ * @see nvmlDeviceGetCount ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its board serial number. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * This number corresponds to the value printed directly on the board, and to the value returned by ++ * \ref nvmlDeviceGetSerial(). ++ * ++ * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor ++ * of \ref nvmlDeviceGetHandleByUUID. ++ * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs as it searches for the target GPU ++ * ++ * @param serial The board serial number of the target GPU ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one ++ * device has the same serial (dual GPU boards) ++ * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetSerial ++ * @see nvmlDeviceGetHandleByUUID ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. ++ * ++ * For all products. 
++ * ++ * @param uuid The UUID of the target GPU or MIG instance ++ * @param device Reference in which to return the device handle or MIG device handle ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs as it searches for the target GPU ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null ++ * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetUUID ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its PCI bus id. ++ * ++ * For all products. ++ * ++ * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs if: ++ * - The target GPU is an SLI slave ++ * ++ * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND ++ * instead of NVML_ERROR_NO_PERMISSION. 
++ * ++ * @param pciBusId The PCI bus id of the target GPU ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); ++ ++/** ++ * Retrieves the name of this device. ++ * ++ * For all products. ++ * ++ * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not ++ * exceed 96 characters in length (including the NULL terminator). See \ref ++ * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. ++ * ++ * When used with MIG device handles the API returns MIG device names which can be used to identify devices ++ * based on their attributes. 
++ * ++ * @param device The identifier of the target device ++ * @param name Reference in which to return the product name ++ * @param length The maximum allowed length of the string returned in \a name ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a name has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); ++ ++/** ++ * Retrieves the brand of this device. ++ * ++ * For all products. ++ * ++ * The type is a member of \ref nvmlBrandType_t defined above. ++ * ++ * @param device The identifier of the target device ++ * @param type Reference in which to return the product brand type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a name has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); ++ ++/** ++ * Retrieves the NVML index of this device. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a accessibleDevices count returned by ++ * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it ++ * is recommended that devices be looked up by their PCI ids or GPU UUID. See ++ * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). ++ * ++ * When used with MIG device handles this API returns indices that can be ++ * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. ++ * MIG device indices are unique within a device. ++ * ++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. ++ * ++ * @param device The identifier of the target device ++ * @param index Reference in which to return the NVML index of the device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a index has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetHandleByIndex() ++ * @see nvmlDeviceGetCount() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); ++ ++/** ++ * Retrieves the globally unique board serial number associated with this device's board. ++ * ++ * For all products with an inforom. ++ * ++ * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). ++ * This number matches the serial number tag that is physically attached to the board. See \ref ++ * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param serial Reference in which to return the board/module serial number ++ * @param length The maximum allowed length of the string returned in \a serial ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a serial has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); ++ ++/** ++ * Get a unique identifier for the device module on the baseboard ++ * ++ * This API retrieves a unique identifier for each GPU module that exists on a given baseboard. ++ * For non-baseboard products, this ID would always be 0. 
++ * ++ * @param device The identifier of the target device ++ * @param moduleId Unique identifier for the GPU module ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a moduleId has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a moduleId is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetModuleId(nvmlDevice_t device, unsigned int *moduleId); ++ ++/** ++ * Retrieves the Device's C2C Mode information ++ * ++ * @param device The identifier of the target device ++ * @param c2cModeInfo Output struct containing the device's C2C Mode info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the C2C Mode Info query is successful ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a c2cModeInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetC2cModeInfoV(nvmlDevice_t device, nvmlC2cModeInfo_v1_t *c2cModeInfo); ++ ++/***************************************************************************************************/ ++ ++/** @defgroup nvmlAffinity CPU and Memory Affinity ++ * This chapter describes NVML operations that are associated with CPU and memory ++ * affinity. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++//! Scope of NUMA node for affinity queries ++#define NVML_AFFINITY_SCOPE_NODE 0 ++//! 
Scope of processor socket for affinity queries ++#define NVML_AFFINITY_SCOPE_SOCKET 1 ++ ++typedef unsigned int nvmlAffinityScope_t; ++ ++/** ++ * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with ++ * the ideal memory affinity within node or socket for the device. ++ * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, ++ * result[0] = 0x3 ++ * ++ * \note If requested scope is not applicable to the target topology, the API ++ * will fall back to reporting the memory affinity for the immediate non-I/O ++ * ancestor of the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param nodeSetSize The size of the nodeSet array that is safe to access ++ * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * @param scope Scope that change the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); ++ ++/** ++ * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ++ * ideal CPU affinity within node or socket for the device. 
++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, ++ * result[0] = 0x3, result[1] = 0x3 ++ * ++ * \note If requested scope is not applicable to the target topology, the API ++ * will fall back to reporting the CPU affinity for the immediate non-I/O ++ * ancestor of the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param cpuSetSize The size of the cpuSet array that is safe to access ++ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * @param scope Scope that changes the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cpuAffinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or scope is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++ ++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); ++ ++/** ++ * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device ++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, ++ * result[0] = 0x3, result[1] = 0x3 ++ * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device The identifier of the target device ++ * @param cpuSetSize The size of the cpuSet array that is safe to access ++ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cpuAffinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); ++ ++/** ++ * Sets the ideal affinity for the calling thread and device using the guidelines ++ * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. ++ * Older versions set the affinity for a calling process and all children. ++ * Currently supports up to 1024 processors. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the calling process has been successfully bound ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); ++ ++/** ++ * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version ++ * 8.0 as older versions cleared the affinity for a calling process and all children. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the calling process has been successfully unbound ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); ++ ++/** ++ * Get the NUMA node of the given GPU device. ++ * This only applies to platforms where the GPUs are NUMA nodes. ++ * ++ * @param[in] device The device handle ++ * @param[out] node NUMA node ID of the device ++ * ++ * @returns ++ * - \ref NVML_SUCCESS if the NUMA node is retrieved successfully ++ * - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a node is invalid ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node); ++/** ++ * Retrieve the common ancestor for two devices ++ * For all products. ++ * Supported on Linux only. 
++ * ++ * @param device1 The identifier of the first device ++ * @param device2 The identifier of the second device ++ * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pathInfo has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++ ++/** @} */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); ++ ++/** ++ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level ++ * For all products. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the first device ++ * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs ++ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray ++ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count ++ * number of device handles. 
++ * @param deviceArray An array of device handles for GPUs found at \a level ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); ++ ++/** ++ * Retrieve the status for a given p2p capability index between a given pair of GPU ++ * ++ * @param device1 The first device ++ * @param device2 The second device ++ * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 ++ * @param p2pStatus Reference in which to return the status of the \a p2pIndex ++ * between \a device1 and \a device2 ++ * @return ++ * - \ref NVML_SUCCESS if \a p2pStatus has been populated ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); ++ ++/** ++ * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, ++ * that augments the immutable, board serial identifier. ++ * ++ * For all products. ++ * ++ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. ++ * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. 
++ * ++ * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG ++ * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. ++ * ++ * @param device The identifier of the target device ++ * @param uuid Reference in which to return the GPU UUID ++ * @param length The maximum allowed length of the string returned in \a uuid ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a uuid has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); ++ ++/** ++ * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for ++ * each GPU will have the form /dev/nvidia[minor number]. ++ * ++ * For all products. 
++ * Supported only for Linux ++ * ++ * @param device The identifier of the target device ++ * @param minorNumber Reference in which to return the minor number for the device ++ * @return ++ * - \ref NVML_SUCCESS if the minor number is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); ++ ++/** ++ * Retrieves the device board part number which is programmed into the board's InfoROM ++ * ++ * For all products. ++ * ++ * @param device Identifier of the target device ++ * @param partNumber Reference to the buffer to return ++ * @param length Length of the buffer reference ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a partNumber has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); ++ ++/** ++ * Retrieves the version information for the device's infoROM object. ++ * ++ * For all products with an inforom. ++ * ++ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate ++ * ECC counts. 
The version of the data structures in this memory may change from time to time. It will not ++ * exceed 16 characters in length (including the NULL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. ++ * ++ * See \ref nvmlInforomObject_t for details on the available infoROM objects. ++ * ++ * @param device The identifier of the target device ++ * @param object The target infoROM object ++ * @param version Reference in which to return the infoROM version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomImageVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); ++ ++/** ++ * Retrieves the global infoROM image version ++ * ++ * For all products with an inforom. ++ * ++ * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board ++ * in contrast to infoROM object version which is only an indicator of supported features. ++ * Version string will not exceed 16 characters in length (including the NULL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param version Reference in which to return the infoROM image version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); ++ ++/** ++ * Retrieves the checksum of the configuration stored in the device's infoROM. ++ * ++ * For all products with an inforom. ++ * ++ * Can be used to make sure that two GPUs have the exact same configuration. ++ * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. ++ * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) ++ * ++ * @param device The identifier of the target device ++ * @param checksum Reference in which to return the infoROM configuration checksum ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a checksum has been set ++ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); ++ ++/** ++ * Reads the infoROM from the flash and verifies the checksums. ++ * ++ * For all products with an inforom. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if infoROM is not corrupted ++ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); ++ ++/** ++ * Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. ++ * ++ * For all products with an inforom. 
++ * ++ * @param device The identifier of the target device ++ * @param timestamp The start timestamp of the last BBX Flush ++ * @param durationUs The duration (us) of the last BBX Flush ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a timestamp and \a durationUs are successfully retrieved ++ * - \ref NVML_ERROR_NOT_READY if the BBX object has not been flushed yet ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetLastBBXFlushTime(nvmlDevice_t device, unsigned long long *timestamp, ++ unsigned long *durationUs); ++ ++/** ++ * Retrieves the display mode for the device. ++ * ++ * For all products. ++ * ++ * This method indicates whether a physical display (e.g. monitor) is currently connected to ++ * any of the device's connectors. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param display Reference in which to return the display mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a display has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); ++ ++/** ++ * Retrieves the display active state for the device. ++ * ++ * For all products. ++ * ++ * This method indicates whether a display is initialized on the device. 
++ * For example whether X Server is attached to this device and has allocated memory for the screen. ++ * ++ * Display can be active even when no monitor is physically attached. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param isActive Reference in which to return the display active state ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isActive has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); ++ ++/** ++ * Retrieves the persistence mode associated with this device. ++ * ++ * For all products. ++ * For Linux only. ++ * ++ * When driver persistence mode is enabled the driver software state is not torn down when the last ++ * client disconnects. By default this feature is disabled. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current driver persistence mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetPersistenceMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Retrieves PCI attributes of this device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlPciInfoExt_v1_t for details on the available PCI info. ++ * ++ * @param device The identifier of the target device ++ * @param pci Reference in which to return the PCI info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfoExt(nvmlDevice_t device, nvmlPciInfoExt_t *pci); ++ ++/** ++ * Retrieves the PCI attributes of this device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlPciInfo_t for details on the available PCI info. 
++ * ++ * @param device The identifier of the target device ++ * @param pci Reference in which to return the PCI info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); ++ ++/** ++ * Retrieves the maximum PCIe link generation possible with this device and system ++ * ++ * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will ++ * report is generation 1. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param maxLinkGen Reference in which to return the max PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkGen has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); ++ ++/** ++ * Retrieves the maximum PCIe link generation supported by this device ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param maxLinkGenDevice Reference in which to return the max PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkGenDevice has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGenDevice is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGenDevice); ++ ++/** ++ * Retrieves the maximum PCIe link width possible with this device and system ++ * ++ * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report ++ * a max link width of 8. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param maxLinkWidth Reference in which to return the max PCIe link width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); ++ ++/** ++ * Retrieves the current PCIe link generation ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param currLinkGen Reference in which to return the current PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a currLinkGen has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); ++ ++/** ++ * Retrieves the current PCIe link width ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param currLinkWidth Reference in which to return the current PCIe link width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a currLinkWidth has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); ++ ++/** ++ * Retrieve PCIe utilization information. ++ * This function is querying a byte counter over a 20ms interval and thus is the ++ * PCIe throughput over that interval. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * This method is not supported in virtual machines running virtual GPU (vGPU). 
++ * ++ * @param device The identifier of the target device ++ * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t ++ * @param value Reference in which to return throughput in KB/s ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a value has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); ++ ++/** ++ * Retrieve the PCIe replay counter. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param value Reference in which to return the counter's value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a value has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); ++ ++/** ++ * Retrieves the current clock speeds for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlClockType_t for details on available clock information. 
++ * ++ * @param device The identifier of the target device ++ * @param type Identify which clock domain to query ++ * @param clock Reference in which to return the clock speed in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clock has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); ++ ++/** ++ * Retrieves the maximum clock speeds for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlClockType_t for details on available clock information. ++ * ++ * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks ++ * by few MHz. 
++ * ++ * @param device The identifier of the target device ++ * @param type Identify which clock domain to query ++ * @param clock Reference in which to return the clock speed in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clock has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); ++ ++/** ++ * Retrieve the GPCCLK VF offset value ++ * @param[in] device The identifier of the target device ++ * @param[out] offset The retrieved GPCCLK VF offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); ++ ++/** ++ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. ++ * Can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the default applications clock that GPU boots with or ++ * defaults to after \ref nvmlDeviceResetApplicationsClocks call. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the default clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * \see nvmlDeviceGetApplicationsClock ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the clock speed for the clock specified by the clock type and clock ID. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockId Identify which clock in the domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the customer defined maximum boost clock speed specified by the given clock type. ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param count Reference in which to provide the \a clocksMHz array size, and ++ * to return the number of elements ++ * @param clocksMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of ++ * required elements) ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetApplicationsClocks ++ * @see nvmlDeviceGetSupportedGraphicsClocks ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); ++ ++/** ++ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param memoryClockMHz Memory clock for which to return possible graphics clocks ++ * @param count Reference in which to provide the \a clocksMHz array size, and ++ * to return the number of elements ++ * @param clocksMHz Reference in which to return the clocks in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetApplicationsClocks ++ * @see nvmlDeviceGetSupportedMemoryClocks ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); ++ ++/** ++ * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. 
++ * ++ * @param device The identifier of the target device ++ * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device ++ * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will ++ * revert to when no applications are using the GPU ++ * ++ * @return ++ * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); ++ ++/** ++ * Retrieves the intended operating speed of the device's fan. ++ * ++ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the ++ * output will not match the actual fan speed. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. 
++ * ++ * @param device The identifier of the target device ++ * @param speed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a speed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); ++ ++ ++/** ++ * Retrieves the intended operating speed of the device's specified fan. ++ * ++ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the ++ * output will not match the actual fan speed. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the target fan, zero indexed. 
++ * @param speed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a speed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); ++ ++/** ++ * Retrieves the intended target speed of the device's specified fan. ++ * ++ * Normally, the driver dynamically adjusts the fan based on ++ * the needs of the GPU. But when user set fan speed using nvmlDeviceSetFanSpeed_v2, ++ * the driver will attempt to make the fan achieve the setting in ++ * nvmlDeviceSetFanSpeed_v2. The actual current speed of the fan ++ * is reported in nvmlDeviceGetFanSpeed_v2. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the target fan, zero indexed. 
++ * @param targetSpeed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a targetSpeed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a targetSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); ++ ++/** ++ * Retrieves the min and max fan speed that user can set for the GPU fan. ++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * @param device The identifier of the target device ++ * @param minSpeed The minimum speed allowed to set ++ * @param maxSpeed The maximum speed allowed to set ++ * ++ * return ++ * NVML_SUCCESS if speed has been adjusted ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if device is invalid ++ * NVML_ERROR_NOT_SUPPORTED if the device does not support this ++ * (doesn't have fans) ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, ++ unsigned int * maxSpeed); ++ ++/** ++ * Gets current fan control policy. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * device The identifier of the target \a device ++ * policy Reference in which to return the fan control \a policy ++ * ++ * return ++ * NVML_SUCCESS if \a policy has been populated ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference ++ * a fan that exists. ++ * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, ++ nvmlFanControlPolicy_t *policy); ++ ++/** ++ * Retrieves the number of fans on the device. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * @param device The identifier of the target device ++ * @param numFans The number of fans ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a fan number query was successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a numFans is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans); ++ ++/** ++ * Retrieves the current temperature readings for the device, in degrees C. ++ * ++ * For all products. ++ * ++ * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. 
++ * ++ * @param device The identifier of the target device ++ * @param sensorType Flag that indicates which sensor reading to retrieve ++ * @param temp Reference in which to return the temperature reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); ++ ++ ++/** ++ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. ++ * ++ * Note: This API is no longer the preferred interface for retrieving the following temperature thresholds ++ * on Ada and later architectures: NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, ++ * NVML_TEMPERATURE_THRESHOLD_MEM_MAX and NVML_TEMPERATURE_THRESHOLD_GPU_MAX. ++ * ++ * Support for reading these temperature thresholds for Ada and later architectures would be removed from this ++ * API in future releases. Please use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_TEMPERATURE_* fields to retrieve ++ * temperature thresholds on these architectures. 
++ * ++ * @param device The identifier of the target device ++ * @param thresholdType The type of threshold value queried ++ * @param temp Reference in which to return the temperature reading ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); ++ ++/** ++ * Used to execute a list of thermal system instructions. ++ * ++ * @param device The identifier of the target device ++ * @param sensorIndex The index of the thermal sensor ++ * @param pThermalSettings Reference in which to return the thermal sensor information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pThermalSettings has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pThermalSettings is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings); ++ ++/** ++ * Retrieves the current performance state for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * ++ * See \ref nvmlPstates_t for details on allowed performance states. ++ * ++ * @param device The identifier of the target device ++ * @param pState Reference in which to return the performance state reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pState has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); ++ ++/** ++ * Retrieves current clocks event reasons. ++ * ++ * For all fully supported products. ++ * ++ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
++ * ++ * @param device The identifier of the target device ++ * @param clocksEventReasons Reference in which to return bitmask of active clocks event ++ * reasons ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clocksEventReasons has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksEventReasons is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlClocksEventReasons ++ * @see nvmlDeviceGetSupportedClocksEventReasons ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, unsigned long long *clocksEventReasons); ++ ++/** ++ * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); ++ ++/** ++ * Retrieves bitmask of supported clocks event reasons that can be returned by ++ * \ref nvmlDeviceGetCurrentClocksEventReasons ++ * ++ * For all fully supported products. ++ * ++ * This method is not supported in virtual machines running virtual GPU (vGPU). 
++ * ++ * @param device The identifier of the target device ++ * @param supportedClocksEventReasons Reference in which to return bitmask of supported ++ * clocks event reasons ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a supportedClocksEventReasons has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksEventReasons is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlClocksEventReasons ++ * @see nvmlDeviceGetCurrentClocksEventReasons ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t device, unsigned long long *supportedClocksEventReasons); ++ ++/** ++ * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); ++ ++/** ++ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. ++ * ++ * Retrieve the current performance state for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlPstates_t for details on allowed performance states. 
++ * ++ * @param device The identifier of the target device ++ * @param pState Reference in which to return the performance state reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pState has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); ++ ++/** ++ * Retrieve performance monitor samples from the associated subdevice. ++ * ++ * @param device ++ * @param pDynamicPstatesInfo ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); ++ ++/** ++ * Retrieve the MemClk (Memory Clock) VF offset value. 
++ * @param[in] device The identifier of the target device ++ * @param[out] offset The retrieved MemClk VF offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); ++ ++/** ++ * Retrieve min and max clocks of some clock domain for a given PState ++ * ++ * @param device The identifier of the target device ++ * @param type Clock domain ++ * @param pstate PState to query ++ * @param minClockMHz Reference in which to return min clock frequency ++ * @param maxClockMHz Reference in which to return max clock frequency ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both ++ * \a minClockMHz and \a maxClockMHz are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, ++ unsigned int * minClockMHz, unsigned int * maxClockMHz); ++ ++/** ++ * Get all supported Performance States (P-States) for the device. ++ * ++ * The returned array would contain a contiguous list of valid P-States supported by ++ * the device. If the number of supported P-States is fewer than the size of the array ++ * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. ++ * ++ * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. 
++ * ++ * @param device The identifier of the target device ++ * @param pstates Container to return the list of performance states ++ * supported by device ++ * @param size Size of the supplied \a pstates array in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pstates array has been retrieved ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the container supplied was not large enough to ++ * hold the resulting list ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, ++ nvmlPstates_t *pstates, unsigned int size); ++ ++/** ++ * Retrieve the GPCCLK min max VF offset value. ++ * @param[in] device The identifier of the target device ++ * @param[out] minOffset The retrieved GPCCLK VF min offset value ++ * @param[out] maxOffset The retrieved GPCCLK VF max offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, ++ int *minOffset, int *maxOffset); ++ ++/** ++ * Retrieve the MemClk (Memory Clock) min max VF offset value.
++ * @param[in] device The identifier of the target device ++ * @param[out] minOffset The retrieved MemClk VF min offset value ++ * @param[out] maxOffset The retrieved MemClk VF max offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, ++ int *minOffset, int *maxOffset); ++ ++/** ++ * Retrieve min, max and current clock offset of some clock domain for a given PState ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Note: \ref nvmlDeviceGetGpcClkVfOffset, \ref nvmlDeviceGetMemClkVfOffset, \ref nvmlDeviceGetGpcClkMinMaxVfOffset and ++ * \ref nvmlDeviceGetMemClkMinMaxVfOffset will be deprecated in a future release. ++ Use \ref nvmlDeviceGetClockOffsets instead. 
++ * ++ * @param device The identifier of the target device ++ * @param info Structure specifying the clock type (input) and the pstate (input) ++ * retrieved clock offset value (output), min clock offset (output) ++ * and max clock offset (output) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both ++ * \a minClockOffsetMHz and \a maxClockOffsetMHz are NULL ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); ++ ++/** ++ * Control current clock offset of some clock domain for a given PState ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param info Structure specifying the clock type (input), the pstate (input) ++ * and clock offset value (input) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or ++ * \a clockOffsetMHz is out of allowed range. ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); ++ ++/** ++ * This API has been deprecated. ++ * ++ * Retrieves the power management mode associated with this device.
++ * ++ * For products from the Fermi family. ++ * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. ++ * ++ * For products from the Kepler or newer families. ++ * - Does not require \a NVML_INFOROM_POWER object. ++ * ++ * This flag indicates whether any power management algorithm is currently active on the device. An ++ * enabled state does not necessarily mean the device is being actively throttled -- only that ++ * the driver will do so if the appropriate conditions are met. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current power management mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Retrieves the power management limit associated with this device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * The power limit defines the upper boundary for the card's power draw. If ++ * the card's total power draw reaches this limit the power management algorithm kicks in. ++ * ++ * This reading is only available if power management mode is supported. ++ * See \ref nvmlDeviceGetPowerManagementMode.
++ * ++ * @param device The identifier of the target device ++ * @param limit Reference in which to return the power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); ++ ++/** ++ * Retrieves information about possible values of power management limits on this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param minLimit Reference in which to return the minimum power management limit in milliwatts ++ * @param maxLimit Reference in which to return the maximum power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetPowerManagementLimit ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); ++ ++/** ++ * Retrieves default power management limit on this device, in milliwatts. 
++ * Default power management limit is a power management limit that the device boots with. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param defaultLimit Reference in which to return the default power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a defaultLimit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); ++ ++/** ++ * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. On Ampere ++ * (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval. On GA100 and ++ * older architectures, instantaneous power is returned. ++ * ++ * See \ref NVML_FI_DEV_POWER_AVERAGE and \ref NVML_FI_DEV_POWER_INSTANT to query specific power ++ * values. ++ * ++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
++ * ++ * @param device The identifier of the target device ++ * @param power Reference in which to return the power usage information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a power has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); ++ ++/** ++ * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param energy Reference in which to return the energy consumption information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a energy has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); ++ ++/** ++ * Get the effective power limit that the driver enforces after taking into account all limiters ++ * ++ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere ++ * This includes the out of band power limit interface ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The device to communicate with ++ * @param limit Reference in which to return the power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); ++ ++/** ++ * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). ++ * ++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. ++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. ++ * Not supported on Quadro ® and Tesla &tm; C-class products. 
++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current GOM ++ * @param pending Reference in which to return the pending GOM ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current and \a pending have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlGpuOperationMode_t ++ * @see nvmlDeviceSetGpuOperationMode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); ++ ++/** ++ * Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. ++ * The reserved amount is supported on version 2 only. ++ * ++ * For all products. ++ * ++ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. ++ * Under WDDM most device memory is allocated and managed on startup by Windows. ++ * ++ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated ++ * by all active channels on the device. ++ * ++ * See \ref nvmlMemory_v2_t for details on available memory info. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate ++ * information, only if the caller has appropriate privileges. Per-instance ++ * information can be queried by using specific MIG device handles. ++ * ++ * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information.
++ * ++ * @note On systems where GPUs are NUMA nodes, the accuracy of FB memory utilization ++ * provided by this API depends on the memory accounting of the operating system. ++ * This is because FB memory is managed by the operating system instead of the NVIDIA GPU driver. ++ * Typically, pages allocated from FB memory are not released even after ++ * the process terminates to enhance performance. In scenarios where ++ * the operating system is under memory pressure, it may resort to utilizing FB memory. ++ * Such actions can result in discrepancies in the accuracy of memory reporting. ++ * ++ * @param device The identifier of the target device ++ * @param memory Reference in which to return the memory information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memory has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); ++ ++/** ++ * nvmlDeviceGetMemoryInfo_v2 accounts separately for reserved memory and includes it in the used memory amount. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory); ++ ++/** ++ * Retrieves the current compute mode for the device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlComputeMode_t for details on allowed compute modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current compute mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetComputeMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); ++ ++/** ++ * Retrieves the CUDA compute capability of the device. ++ * ++ * For all products. ++ * ++ * Returns the major and minor compute capability version numbers of the ++ * device. The major and minor versions are equivalent to the ++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and ++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be ++ * returned by CUDA's cuDeviceGetAttribute().
++ * ++ * @param device The identifier of the target device ++ * @param major Reference in which to return the major CUDA compute capability ++ * @param minor Reference in which to return the minor CUDA compute capability ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a major and \a minor have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); ++ ++/** ++ * Retrieves the current and pending ECC modes for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * ++ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following ++ * the next reboot. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current ECC mode ++ * @param pending Reference in which to return the pending ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current and \a pending have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); ++ ++/** ++ * Retrieves the default ECC modes for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param defaultMode Reference in which to return the default ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a defaultMode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); ++ ++/** ++ * Retrieves the device boardId from 0-N. ++ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with ++ * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. ++ * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across ++ * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and ++ * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will ++ * always return those values but they will always be different from each other). ++ * ++ * ++ * For Fermi &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param boardId Reference in which to return the device's board ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a boardId has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); ++ ++/** ++ * Retrieves whether the device is on a Multi-GPU Board ++ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param multiGpuBool Reference in which to return a zero or non-zero value ++ * to indicate whether the device is on a multi GPU board ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a multiGpuBool has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); ++ ++/** ++ * Retrieves the total ECC error counts for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * Requires ECC Mode to be enabled. 
++ * ++ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of ++ * errors across the entire device. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types. ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of the errors. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param eccCounts Reference in which to return the specified ECC errors ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a eccCounts has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceClearEccErrorCounts() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); ++ ++/** ++ * Retrieves the detailed ECC error counts for the device. ++ * ++ * @deprecated This API supports only a fixed set of ECC error locations ++ * On different GPU architectures different locations are supported ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. ++ * Requires ECC Mode to be enabled. 
++ * ++ * Detailed errors provide separate ECC counts for specific parts of the memory system. ++ * ++ * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n ++ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of the errors. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param eccCounts Reference in which to return the specified ECC errors ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a eccCounts has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceClearEccErrorCounts() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); ++ ++/** ++ * Retrieves the requested memory error counter for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. ++ * ++ * Only applicable to devices with ECC. ++ * ++ * Requires ECC Mode to be enabled. 
++ * ++ * @note On MIG-enabled GPUs, per instance information can be queried using specific ++ * MIG device handles. Per instance information is currently only supported for ++ * non-DRAM uncorrectable volatile errors. Querying volatile errors using device ++ * handles is currently not supported. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n ++ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of error. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param locationType Specifies the location of the counter. ++ * @param count Reference in which to return the ECC counter ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is ++ * invalid, or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, ++ nvmlEccCounterType_t counterType, ++ nvmlMemoryLocation_t locationType, unsigned long long *count); ++ ++/** ++ * Retrieves the current utilization rates for the device's major subsystems. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlUtilization_t for details on available utilization rates. 
++ * ++ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. ++ * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. ++ * ++ * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference in which to return the utilization information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the Encoder ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for encoder utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param encoderQueryType Type of encoder to query ++ * @param encoderCapacity Reference to an unsigned int for the encoder capacity ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); ++ ++/** ++ * Retrieves the current encoder statistics for a given device. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param sessionCount Reference to an unsigned int for count of active encoder sessions ++ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions ++ * @param averageLatency Reference to an unsigned int for encode latency in microseconds ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, ++ * or \a averageLatency is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, ++ unsigned int *averageFps, unsigned int *averageLatency); ++ ++/** ++ * Retrieves information about active encoder sessions on a target device. ++ * ++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The ++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. ++ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return ++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++ * @param sessionInfos Reference in which to return the session information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionInfos is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the Decoder ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for decoder utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the JPG ++ * ++ * %TURING_OR_NEWER% ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for jpg utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetJpgUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the OFA (Optical Flow Accelerator) ++ * ++ * %TURING_OR_NEWER% ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for ofa utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetOfaUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++* Retrieves the active frame buffer capture sessions statistics for a given device. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @param device The identifier of the target device ++* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats ++* ++* @return ++* - \ref NVML_SUCCESS if \a fbcStats is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); ++ ++/** ++* Retrieves information about active frame buffer capture sessions on a target device. ++* ++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. 
The ++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++* written to the buffer. ++* ++* If the supplied buffer is not large enough to accommodate the active session array, the function returns ++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. ++* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return ++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may ++* be zero if there are no new frames captured since the session started. ++* ++* @param device The identifier of the target device ++* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++* @param sessionInfo Reference in which to return the session information ++* ++* @return ++* - \ref NVML_SUCCESS if \a sessionInfo is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); ++ ++/** ++ * Retrieves the current and pending driver model for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * For windows only. ++ * ++ * On Windows platforms the device driver can run in either WDDM, MCDM or WDM (TCC) modes. 
If a display is attached ++ * to the device it must run in WDDM mode. MCDM mode is preferred if a display is not attached. TCC mode is deprecated. ++ * ++ * See \ref nvmlDriverModel_t for details on available driver models. ++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current driver model ++ * @param pending Reference in which to return the pending driver model ++ * ++ * @return ++ * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetDriverModel_v2() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); ++ ++/** ++ * Get VBIOS version of the device. ++ * ++ * For all products. ++ * ++ * The VBIOS version may change from time to time. It will not exceed 32 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param version Reference to which to return the VBIOS version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); ++ ++/** ++ * Get Bridge Chip Information for all the bridge chips on the board. ++ * ++ * For all fully supported products. ++ * Only applicable to multi-GPU products. ++ * ++ * @param device The identifier of the target device ++ * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy ++ * ++ * @return ++ * - \ref NVML_SUCCESS if bridge chip exists ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); ++ ++/** ++ * Get information about processes with a compute context on a device ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * This function returns information only about compute running processes (e.g. CUDA application which have ++ * active context). 
Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. ++ * ++ * To query the current number of running compute processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new compute processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about processes with a graphics context on a device ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * This function returns information only about graphics based processes ++ * (eg. applications using OpenGL, DirectX) ++ * ++ * To query the current number of running graphics processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. 
++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new graphics processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. ++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about processes with a Multi-Process Service (MPS) compute context on a device ++ * ++ * For Volta &tm; or newer 
fully supported devices. ++ * ++ * This function returns information only about compute running processes (e.g. CUDA application which have ++ * active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by ++ * this function. ++ * ++ * To query the current number of running compute processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new compute processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about running processes on a device for input context ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * This function returns information only about running processes (e.g. CUDA application which have ++ * active context). ++ * ++ * To determine the size of the \a plist->procArray array to allocate, call the function with ++ * \a plist->numProcArrayEntries set to zero and \a plist->procArray set to NULL. The return ++ * code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type ++ * \a plist->mode to report on, in which case the \a plist->numProcArrayEntries field will ++ * indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type ++ * \a plist->mode exist). 
++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a plist->procArray table in case new processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in ++ * vGPU Host virtualization mode. ++ * Protected memory usage is currently not available in MIG mode and in windows. ++ * ++ * @param device The device handle or MIG device handle ++ * @param plist Reference in which to return the process detail list ++ * \a plist->version The api version ++ * \a plist->mode The process mode ++ * \a plist->procArray Reference in which to return the process information ++ * \a plist->numProcArrayEntries Proc array size of returned entries ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a plist->numProcArrayEntries and \a plist->procArray have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a plist->numProcArrayEntries indicates that the \a plist->procArray is too small ++ * \a plist->numProcArrayEntries will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a plist is NULL, \a plist->version is invalid, ++ * or \a plist->mode is invalid ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise 
inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t *plist); ++ ++/** ++ * Check if the GPU devices are on the same physical board. ++ * ++ * For all fully supported products. ++ * ++ * @param device1 The first GPU device ++ * @param device2 The second GPU device ++ * @param onSameBoard Reference in which to return the status. ++ * Non-zero indicates that the GPUs are on the same board. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a onSameBoard has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); ++ ++/** ++ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. ++ * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. ++ * ++ * For all fully supported products. 
++ * ++ * @param device The identifier of the target device ++ * @param apiType Target API type for this operation ++ * @param isRestricted Reference in which to return the current restriction ++ * NVML_FEATURE_ENABLED indicates that the API is root-only ++ * NVML_FEATURE_DISABLED indicates that the API is accessible to all users ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isRestricted has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support ++ * the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is ++ * not supported by the device) ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlRestrictedAPI_t ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); ++ ++/** ++ * Gets recent samples for the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by ++ * the driver. ++ * ++ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. ++ * ++ * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. ++ * The returned samplesCount will provide the number of samples that can be queried. The user needs to ++ * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). ++ * ++ * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the ++ * underlying buffer. 
Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query ++ * to get more recent samples. ++ * ++ * This method fetches the number of entries which can be accommodated in the provided samples array, and the ++ * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this ++ * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost. ++ * ++ * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES, ++ * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported. ++ * ++ * @param device The identifier for the target device ++ * @param type Type of sampling event ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. ++ * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t ++ * @param sampleCount Reference to provide the number of elements which can be queried in samples array ++ * @param samples Reference in which samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or ++ * reference to \a sampleCount is 0 for non null \a samples ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, ++ nvmlValueType_t *sampleValType, 
unsigned int *sampleCount, nvmlSample_t *samples); ++ ++/** ++ * Gets Total, Available and Used size of BAR1 memory. ++ * ++ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party ++ * devices (peer-to-peer on the PCIE bus). ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate ++ * information, only if the caller has appropriate privileges. Per-instance ++ * information can be queried by using specific MIG device handles. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param bar1Memory Reference in which BAR1 memory ++ * information is returned. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); ++ ++/** ++ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power ++ * or thermal constraints. ++ * ++ * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The ++ * difference in violation times at two different reference times gives the indication of GPU throttling event. ++ * ++ * Violation for thermal capping is not supported at this time. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param perfPolicyType Represents Performance policy which can trigger GPU throttling ++ * @param violTime Reference to which violation time related information is returned ++ * ++ * ++ * @return ++ * - \ref NVML_SUCCESS if violation time is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); ++ ++/** ++ * Gets the device's interrupt number ++ * ++ * @param device The identifier of the target device ++ * @param irqNum The interrupt number associated with the specified device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if irq number is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a irqNum is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqNum); ++ ++/** ++ * Gets the device's core count ++ * ++ * @param device The identifier of the target device ++ * @param numCores The number of cores for the specified device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if Gpu core count is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a 
device is invalid, or \a numCores is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int *numCores); ++ ++/** ++ * Gets the devices power source ++ * ++ * @param device The identifier of the target device ++ * @param powerSource The power source of the device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the current power source was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a powerSource is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t *powerSource); ++ ++/** ++ * Gets the device's memory bus width ++ * ++ * @param device The identifier of the target device ++ * @param busWidth The devices's memory bus width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a busWidth is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int *busWidth); ++ ++/** ++ * Gets the device's PCIE Max Link speed in MBPS ++ * ++ * @param device The identifier of the target device ++ * @param maxSpeed The devices's PCIE Max Link speed in MBPS ++ * ++ * @return 
++ * - \ref NVML_SUCCESS if Pcie Max Link Speed is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed); ++ ++/** ++ * Gets the device's PCIe Link speed in Mbps ++ * ++ * @param device The identifier of the target device ++ * @param pcieSpeed The devices's PCIe Max Link speed in Mbps ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pcieSpeed has been retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pcieSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support PCIe speed getting ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed); ++ ++/** ++ * Gets the device's Adaptive Clock status ++ * ++ * @param device The identifier of the target device ++ * @param adaptiveClockStatus The current adaptive clocking status, either ++ * \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED ++ * or \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a adaptiveClockStatus is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise 
inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); ++ ++/** ++ * Get the type of the GPU Bus (PCIe, PCI, ...) ++ * ++ * @param device The identifier of the target device ++ * @param type The PCI Bus type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the bus \a type is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a type is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); ++ ++ ++ /** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead ++ * ++ * Get fabric information associated with the device. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager ++ * Upon successful registration, the GPU is added to the NVLink fabric to enable ++ * peer-to-peer communication. ++ * This API reports the current state of the GPU in the NVLink fabric ++ * along with other useful information. ++ * ++ * ++ * @param device The identifier of the target device ++ * @param gpuFabricInfo Information about GPU fabric state ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); ++ ++/** ++* Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned ++* \ref nvmlGpuFabricInfo_v2_t or later output structure. ++* ++* @note The caller must set the \ref nvmlGpuFabricInfoV_t.version field to the ++* appropriate version prior to calling this function. 
For example: ++* \code ++* nvmlGpuFabricInfoV_t fabricInfo = ++* { .version = nvmlGpuFabricInfo_v2 }; ++* nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device,&fabricInfo); ++* \endcode ++* ++* For Hopper &tm; or newer fully supported devices. ++* ++* @param device The identifier of the target device ++* @param gpuFabricInfo Information about GPU fabric state ++* ++* @return ++* - \ref NVML_SUCCESS Upon success ++* - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, ++ nvmlGpuFabricInfoV_t *gpuFabricInfo); ++ ++/** ++ * Set new power limit of this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. ++ * ++ * See \ref nvmlPowerValue_v2_t for more information on the struct. ++ * ++ * \note Limit is not persistent across reboots or driver unloads. ++ * Enable persistent mode to prevent driver from unloading when no application is using the device. ++ * ++ * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. 
++ * ++ * @param device The identifier of the target device ++ * @param powerValue Power management limit in milliwatts to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see NVML_FI_DEV_POWER_AVERAGE ++ * @see NVML_FI_DEV_POWER_INSTANT ++ * @see NVML_FI_DEV_POWER_MIN_LIMIT ++ * @see NVML_FI_DEV_POWER_MAX_LIMIT ++ * @see NVML_FI_DEV_POWER_CURRENT_LIMIT ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); ++ ++/** ++ * Get SRAM ECC error status of this device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct. 
++ * ++ * @param device The identifier of the target device ++ * @param status Returns SRAM ECC error status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a status has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a nvmlEccSramErrorStatus_t is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device, ++ nvmlEccSramErrorStatus_t *status); ++ ++/** ++ * Get Conf Computing System capabilities. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param capabilities System CC capabilities ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a capabilities were successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); ++ ++/** ++ * Get Conf Computing System State. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. 
++ * ++ * @param state System CC State ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a state was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); ++ ++/** ++ * Get Conf Computing Protected and Unprotected Memory Sizes. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device Device handle ++ * @param memInfo Protected/Unprotected Memory sizes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memInfo was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); ++ ++/** ++ * Get Conf Computing GPUs ready state. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param isAcceptingWork Returns GPU current work accepting state, ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPUs ready state was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); ++ ++/** ++ * Get Conf Computing protected memory usage. 
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param memory Reference in which to return the memory information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memory has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); ++ ++/** ++ * Get Conf Computing Gpu certificate details. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param gpuCert Reference in which to return the gpu certificate information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpu certificate info has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a gpuCert is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, ++ nvmlConfComputeGpuCertificate_t *gpuCert); ++ ++/** ++ * Get Conf Computing Gpu attestation report. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. 
++ * ++ * @param device The identifier of the target device ++ * @param gpuAtstReport Reference in which to return the gpu attestation report ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpu attestation report has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a gpuAtstReport is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, ++ nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); ++/** ++ * Get Conf Computing key rotation threshold detail. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param pKeyRotationThrInfo Reference in which to return the key rotation threshold data ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpu key rotation threshold info has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pKeyRotationThrInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeKeyRotationThresholdInfo( ++ nvmlConfComputeGetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); ++ ++/** ++ * Set Conf Computing Unprotected Memory Size. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. 
++ * ++ * @param device Device Handle ++ * @param sizeKiB Unprotected Memory size to be set in KiB ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sizeKiB successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); ++ ++/** ++ * Set Conf Computing GPUs ready state. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); ++ ++/** ++ * Set Conf Computing key rotation threshold. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * This function is to set the confidential compute key rotation threshold parameters. ++ * \a pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from ++ * NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. ++ * Default value is 60. 
++ * ++ * @param pKeyRotationThrInfo Reference to the key rotation threshold data ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a key rotation threshold max attacker advantage has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pKeyRotationThrInfo is NULL or invalid ++ * - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeKeyRotationThresholdInfo( ++ nvmlConfComputeSetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); ++ ++/** ++ * Get Conf Computing System Settings. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param settings System CC settings ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the query is success ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a settings is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeSettings(nvmlSystemConfComputeSettings_t *settings); ++ ++/** ++ * Retrieve GSP firmware version. ++ * ++ * The caller passes in buffer via \a version and corresponding GSP firmware numbered version ++ * is returned with the same parameter in string format. 
++ * ++ * @param device Device handle ++ * @param version The retrieved GSP firmware version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if GSP firmware version is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); ++ ++/** ++ * Retrieve GSP firmware mode. ++ * ++ * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with ++ * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. ++ * ++ * @param device Device handle ++ * @param isEnabled Pointer to specify if GSP firmware is enabled ++ * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if GSP firmware mode is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); ++ ++/** ++ * @} ++ */ ++ ++/** @addtogroup nvmlAccountingStats ++ * @{ ++ */ ++ ++/** ++ * Queries the state of per process accounting mode. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlDeviceGetAccountingStats for more details. 
++ * See \ref nvmlDeviceSetAccountingMode ++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Queries process's accounting stats. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. ++ * Accounting stats can be queried during life time of the process and after its termination. ++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and ++ * updated to actual running time after its termination. ++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old ++ * processes. ++ * ++ * See \ref nvmlAccountingStats_t for description of each returned metric. ++ * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. ++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be ++ * queried since they don't contribute to GPU utilization. ++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported ++ * ++ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. 
++ * ++ * @param device The identifier of the target device ++ * @param pid Process Id of the target process to query stats for ++ * @param stats Reference in which to return the process's accounting stats ++ * ++ * @return ++ * - \ref NVML_SUCCESS if stats have been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL ++ * - \ref NVML_ERROR_NOT_FOUND if process stats were not found ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled ++ * or on vGPU host. ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingBufferSize ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); ++ ++/** ++ * Queries list of processes that can be queried for accounting stats. The list of processes returned ++ * can be in running or terminated state. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * To just query the number of processes ready to be queried, call this function with *count = 0 and ++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. ++ * ++ * For more details see \ref nvmlDeviceGetAccountingStats. ++ * ++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
++ * ++ * @param device The identifier of the target device ++ * @param count Reference in which to provide the \a pids array size, and ++ * to return the number of elements ready to be queried ++ * @param pids Reference in which to return list of process ids ++ * ++ * @return ++ * - \ref NVML_SUCCESS if pids were successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled ++ * or on vGPU host. ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to ++ * expected value) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingBufferSize ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); ++ ++/** ++ * Returns the number of processes that the circular buffer with accounting pids can hold. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * This is the maximum number of processes that accounting information will be stored for before information ++ * about oldest processes will get overwritten by information about new processes. ++ * ++ * @param device The identifier of the target device ++ * @param bufferSize Reference in which to provide the size (in number of elements) ++ * of the circular buffer for accounting stats. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if buffer size was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingStats ++ * @see nvmlDeviceGetAccountingPids ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); ++ ++/** @} */ ++ ++/** @addtogroup nvmlDeviceQueries ++ * @{ ++ */ ++ ++/** ++ * Returns the list of retired pages by source, including pages that are pending retirement ++ * The address information provided from this API is the hardware address of the page that was retired. Note ++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param cause Filter page addresses by cause of retirement ++ * @param pageCount Reference in which to provide the \a addresses buffer size, and ++ * to return the number of retired pages that match \a cause ++ * Set to 0 to query the size without allocating an \a addresses buffer ++ * @param addresses Buffer to write the page addresses into ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the ++ * matching page addresses. \a pageCount is set to the needed size. 
++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or ++ * \a addresses is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, ++ unsigned int *pageCount, unsigned long long *addresses); ++ ++/** ++ * Returns the list of retired pages by source, including pages that are pending retirement ++ * The address information provided from this API is the hardware address of the page that was retired. Note ++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 ++ * ++ * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's ++ * retirement. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param cause Filter page addresses by cause of retirement ++ * @param pageCount Reference in which to provide the \a addresses buffer size, and ++ * to return the number of retired pages that match \a cause ++ * Set to 0 to query the size without allocating an \a addresses buffer ++ * @param addresses Buffer to write the page addresses into ++ * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the ++ * matching page addresses. \a pageCount is set to the needed size. 
++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or ++ * \a addresses is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, ++ unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); ++ ++/** ++ * Check if any pages are pending retirement and need a reboot to fully retire. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param isPending Reference in which to return the pending status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isPending was populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); ++ ++/** ++ * Get number of remapped rows. The number of rows reported will be based on ++ * the cause of the remapping. isPending indicates whether or not there are ++ * pending remappings. A reset will be required to actually remap the row. ++ * failureOccurred will be set if a row remapping ever failed in the past. 
A ++ * pending remapping won't affect future work on the GPU since ++ * error-containment and dynamic page blacklisting will take care of that. ++ * ++ * @note On MIG-enabled GPUs with active instances, querying the number of ++ * remapped rows is not supported ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param corrRows Reference for number of rows remapped due to correctable errors ++ * @param uncRows Reference for number of rows remapped due to uncorrectable errors ++ * @param isPending Reference for whether or not remappings are pending ++ * @param failureOccurred Reference that is set when a remapping has failed in the past ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN Unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, ++ unsigned int *isPending, unsigned int *failureOccurred); ++ ++/** ++ * Get the row remapper histogram. Returns the remap availability for each bank ++ * on the GPU. ++ * ++ * @param device Device handle ++ * @param values Histogram values ++ * ++ * @return ++ * - \ref NVML_SUCCESS On success ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); ++ ++/** ++ * Get architecture for device ++ * ++ * @param device The identifier of the target device ++ * @param arch Reference where architecture is returned, if call successful. 
++ * Set to NVML_DEVICE_ARCH_* upon success ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output reference) are invalid ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); ++ ++/** ++ * Retrieves the frequency monitor fault status for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root user. ++ * ++ * See \ref nvmlClkMonStatus_t for details on decoding the status output. ++ * ++ * @param device The identifier of the target device ++ * @param status Reference in which to return the clkmon fault status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a status has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetClkMonStatus() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); ++ ++/** ++ * Retrieves the current utilization and process ID ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. ++ * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at ++ * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization ++ * during the last sample period. It includes the CPU timestamp at which the samples were recorded.
Individual utilization values ++ * are returned as "unsigned int" values. If no valid sample entries are found since the lastSeenTimeStamp, NVML_ERROR_NOT_FOUND ++ * is returned. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilization set to NULL. The caller should allocate a buffer of size ++ * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed ++ * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. ++ * ++ * On successful return, the function updates \a processSamplesCount with the number of process utilization sample ++ * structures that were actually written. This may differ from a previously read value as instances are created or ++ * destroyed. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned ++ * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
++ ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a processSamplesCount is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, ++ unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); ++ ++/** ++ * Retrieves the recent utilization and process ID for all running processes ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder, jpeg decoder, OFA (Optical Flow Accelerator) ++ * for all running processes. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at ++ * by \a procesesUtilInfo->procUtilArray. One utilization sample structure is returned per process running, that had some non-zero utilization ++ * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values ++ * are returned as "unsigned int" values. ++ * ++ * The caller should allocate a buffer of size processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t). If the buffer is too small, the API will ++ * return \a NVML_ERROR_INSUFFICIENT_SIZE, with the recommended minimal buffer size at \a procesesUtilInfo->processSamplesCount.
The caller should ++ * invoke the function again with the allocated buffer passed in \a procesesUtilInfo->procUtilArray, and \a procesesUtilInfo->processSamplesCount ++ * set to the number no less than the recommended value by the previous API return. ++ * ++ * On successful return, the function updates \a procesesUtilInfo->processSamplesCount with the number of process utilization info structures ++ * that were actually written. This may differ from a previously read value as instances are created or destroyed. ++ * ++ * \a procesesUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a procesesUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * \a procesesUtilInfo->version is the version number of the structure nvmlProcessesUtilizationInfo_t, the caller should set the correct version ++ * number to retrieve the specific version of processes utilization information. ++ * ++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param procesesUtilInfo Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t. 
++ ++ * @return ++ * - \ref NVML_SUCCESS if \a procesesUtilInfo->procUtilArray has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a procesesUtilInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a procesesUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. ++ * The caller should check the minimum array size from the returned procesesUtilInfo->processSamplesCount, and call ++ * the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, nvmlProcessesUtilizationInfo_t *procesesUtilInfo); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUnitCommands Unit Commands ++ * This chapter describes NVML operations that change the state of the unit. For S-class products. ++ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION ++ * error code when invoking any of these methods. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Set the LED state for the unit. The LED can be either green (0) or amber (1). ++ * ++ * For S-class products. ++ * Requires root/admin permissions. ++ * ++ * This operation takes effect immediately.
++ * ++ * ++ * Current S-Class products don't provide unique LEDs for each unit. As such, both front ++ * and back LEDs will be toggled in unison regardless of which unit is specified with this command. ++ * ++ * See \ref nvmlLedColor_t for available colors. ++ * ++ * @param unit The identifier of the target unit ++ * @param color The target LED color ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the LED color has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlUnitGetLedState() ++ */ ++nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceCommands Device Commands ++ * This chapter describes NVML operations that change the state of the device. ++ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION ++ * error code when invoking any of these methods. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Set the persistence mode for the device. ++ * ++ * For all products. ++ * For Linux only. ++ * Requires root/admin permissions. ++ * ++ * The persistence mode determines whether the GPU driver software is torn down after the last client ++ * exits. ++ * ++ * This operation takes effect immediately. It is not persistent across reboots. After each reboot the ++ * persistence mode is reset to "Disabled". ++ * ++ * See \ref nvmlEnableState_t for available modes. 
++ * ++ * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA ++ * memory, the given device handle will no longer be valid, and to continue to interact with this ++ * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This ++ * limitation is currently only applicable to devices that have a coherent NVLink connection to ++ * system memory. ++ * ++ * @param device The identifier of the target device ++ * @param mode The target persistence mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the persistence mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetPersistenceMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); ++ ++/** ++ * Set the compute mode for the device. ++ * ++ * For all products. ++ * Requires root/admin permissions. ++ * ++ * The compute mode determines whether a GPU can be used for compute operations and whether it can ++ * be shared across contexts. ++ * ++ * This operation takes effect immediately. Under Linux it is not persistent across reboots and ++ * always resets to "Default". Under windows it is persistent. ++ * ++ * Under windows compute mode may only be set to DEFAULT when running in WDDM ++ * ++ * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. ++ * ++ * See \ref nvmlComputeMode_t for details on available compute modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode The target compute mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the compute mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetComputeMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); ++ ++/** ++ * Set the ECC mode for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * Requires root/admin permissions. ++ * ++ * The ECC mode determines whether the GPU enables its ECC support. ++ * ++ * This operation takes effect after the next reboot. ++ * ++ * See \ref nvmlEnableState_t for details on available modes. 
++ * ++ * @param device The identifier of the target device ++ * @param ecc The target ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the ECC mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); ++ ++/** ++ * Clear the ECC error and other memory error counts for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. ++ * Requires root/admin permissions. ++ * Requires ECC Mode to be enabled. ++ * ++ * Sets all of the specified ECC counters to 0, including both detailed and total counts. ++ * ++ * This operation takes effect immediately. ++ * ++ * See \ref nvmlMemoryErrorType_t for details on available counter types. ++ * ++ * @param device The identifier of the target device ++ * @param counterType Flag that indicates which type of errors should be cleared. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if the error counts were cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see ++ * - nvmlDeviceGetDetailedEccErrors() ++ * - nvmlDeviceGetTotalEccErrors() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); ++ ++/** ++ * Set the driver model for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * For windows only. ++ * Requires root/admin permissions. ++ * ++ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached ++ * to the device it must run in WDDM mode. ++ * ++ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). ++ * This should only be done if the host is subsequently powered down and the display is detached from the device ++ * before the next reboot. ++ * ++ * This operation takes effect after the next reboot. ++ * ++ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. ++ * ++ * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or ++ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. ++ * ++ * See \ref nvmlDriverModel_t for details on available driver models. 
++ * See \ref nvmlFlagDefault and \ref nvmlFlagForce ++ * ++ * @param device The identifier of the target device ++ * @param driverModel The target driver model ++ * @param flags Flags that change the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the driver model has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetDriverModel() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); ++ ++typedef enum nvmlClockLimitId_enum { ++ NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, ++ NVML_CLOCK_LIMIT_ID_TDP, ++ NVML_CLOCK_LIMIT_ID_UNLIMITED ++} nvmlClockLimitId_t; ++ ++/** ++ * Set clocks that device will lock to. ++ * ++ * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. ++ * Setting this will supersede application clock values and take effect regardless if a cuda app is running. ++ * See \ref nvmlDeviceSetApplicationsClocks ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * This can be called with a pair of integer clock frequencies in MHz, or a pair of \ref nvmlClockLimitId_t values. ++ * See the table below for valid combinations of these values.
++ * ++ * minGpuClock | maxGpuClock | Effect ++ * ------------+-------------+-------------------------------------------------- ++ * tdp | tdp | Lock clock to TDP ++ * unlimited | tdp | Upper bound is TDP but clock may drift below this ++ * tdp | unlimited | Lower bound is TDP but clock may boost above this ++ * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) ++ * ++ * If one arg takes one of these values, the other must be one of these values as ++ * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. ++ * ++ * Requires root/admin permissions. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetGpuLockedClocks. ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param minGpuClockMHz Requested minimum gpu clock in MHz ++ * @param maxGpuClockMHz Requested maximum gpu clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); ++ ++/** ++ * Resets the gpu clock to the default value ++ * ++ * This is the gpu clock that will be used after system reboot or driver reload. 
++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * @see nvmlDeviceSetGpuLockedClocks ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); ++ ++/** ++ * Set memory clocks that device will lock to. ++ * ++ * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. ++ * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. ++ * See \ref nvmlDeviceSetApplicationsClocks ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * Requires root/admin permissions. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetMemoryLockedClocks. ++ * ++ * For Ampere &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param minMemClockMHz Requested minimum memory clock in MHz ++ * @param maxMemClockMHz Requested maximum memory clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minMemClockMHz and \a maxMemClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); ++ ++/** ++ * Resets the memory clock to the default value ++ * ++ * This is the memory clock that will be used after system reboot or driver reload. ++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * @see nvmlDeviceSetMemoryLockedClocks ++ * ++ * For Ampere &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); ++ ++/** ++ * Set clocks that applications will lock to. ++ * ++ * Sets the clocks that compute and graphics applications will be running at. ++ * e.g. CUDA driver requests these clocks during context creation which means this property ++ * defines clocks at which CUDA applications will be running unless some overspec event ++ * occurs (e.g. over power, over thermal or external HW brake). ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. ++ * ++ * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call ++ * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting ++ * above the clock value being set. ++ * ++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks ++ * for details on how to list available clocks combinations. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetApplicationsClocks. 
++ * ++ * @param device The identifier of the target device ++ * @param memClockMHz Requested memory clock in MHz ++ * @param graphicsClockMHz Requested graphics clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); ++ ++/** ++ * Resets the application clock to the default value ++ * ++ * This is the applications clock that will be used after system reboot or driver reload. ++ * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, ++ * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above ++ * base clocks as thermal limits allow. ++ * ++ * @see nvmlDeviceGetApplicationsClock ++ * @see nvmlDeviceSetApplicationsClocks ++ * ++ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); ++ ++/** ++ * Try to set the current state of Auto Boosted clocks on a device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock ++ * rates are desired. ++ * ++ * Non-root users may use this API by default but can be restricted by root from using this API by calling ++ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. ++ * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. 
++ * ++ * @param device The identifier of the target device ++ * @param enabled What state to try to set Auto Boosted clocks of the target device to ++ * ++ * @return ++ * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); ++ ++/** ++ * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will ++ * return to when no compute running processes (e.g. CUDA application which have an active context) are running ++ * ++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. ++ * Requires root/admin permissions. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock ++ * rates are desired. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. ++ * ++ * @param device The identifier of the target device ++ * @param enabled What state to try to set default Auto Boosted clocks of the target device to ++ * @param flags Flags that change the default behavior. Currently Unused. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); ++ ++/** ++ * Sets the speed of the fan control policy to default. ++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the fan, starting at zero ++ * ++ * return ++ * NVML_SUCCESS if speed has been adjusted ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if device is invalid ++ * NVML_ERROR_NOT_SUPPORTED if the device does not support this ++ * (doesn't have fans) ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); ++ ++/** ++ * Sets current fan control policy. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Requires privileged user. 
++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * device The identifier of the target \a device ++ * policy The fan control \a policy to set ++ * ++ * return ++ * NVML_SUCCESS if \a policy has been set ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference ++ * a fan that exists. ++ * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, ++ nvmlFanControlPolicy_t policy); ++ ++/** ++ * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. ++ * ++ * @param device The identifier of the target device ++ * @param thresholdType The type of threshold value to be set ++ * @param temp Reference which hold the value to be set ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); ++ ++/** ++ * Set new power limit of this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. ++ * ++ * \note Limit is not persistent across reboots or driver unloads. ++ * Enable persistence mode to prevent driver from unloading when no application is using the device. ++ * ++ * @param device The identifier of the target device ++ * @param limit Power management limit in milliwatts to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetPowerManagementLimitConstraints ++ * @see nvmlDeviceGetPowerManagementDefaultLimit ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); ++ ++/** ++ * Sets new GOM. See \a nvmlGpuOperationMode_t for details. ++ * ++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. ++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. ++ * Not supported on Quadro ® and Tesla &tm; C-class products. ++ * Requires root/admin permissions. ++ * ++ * Changing GOMs requires a reboot. ++ * The reboot requirement might be removed in the future. ++ * ++ * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when ++ * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
++ * ++ * @param device The identifier of the target device ++ * @param mode Target GOM ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlGpuOperationMode_t ++ * @see nvmlDeviceGetGpuOperationMode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); ++ ++/** ++ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. ++ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. ++ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction ++ * to query the current restriction settings. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * @param device The identifier of the target device ++ * @param apiType Target API type for this operation ++ * @param isRestricted The target restriction ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isRestricted has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support ++ * the feature that api restrictions are being set for (E.G.
Enabling/disabling auto ++ * boosted clocks is not supported by the device) ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlRestrictedAPI_t ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); ++ ++/** ++ * Sets the speed of a specified fan. ++ * ++ * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor ++ * the temperature and adjust the fan speed accordingly. ++ * If you set the fan speed too low you can burn your GPU! ++ * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. ++ * ++ * For all cuda-capable discrete products with fans that are Maxwell or Newer. ++ * ++ * device The identifier of the target device ++ * fan The index of the fan, starting at zero ++ * speed The target speed of the fan [0-100] in % of max speed ++ * ++ * return ++ * NVML_SUCCESS if the fan speed has been set ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, ++ * or if the fan index doesn't reference an actual fan. ++ * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. ++ * NVML_ERROR_UNKNOWN if there was an unexpected error. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); ++ ++/** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works ++ * on Maxwell onwards GPU architectures. 
++ * ++ * Set the GPCCLK VF offset value ++ * @param[in] device The identifier of the target device ++ * @param[in] offset The GPCCLK VF offset value to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); ++ ++/** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works ++ * on Maxwell onwards GPU architectures. ++ * ++ * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. ++ * @param[in] device The identifier of the target device ++ * @param[in] offset The MemClk VF offset value to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); ++ ++/** ++ * @} ++ */ ++ ++/** @addtogroup nvmlAccountingStats ++ * @{ ++ */ ++ ++/** ++ * Enables or disables per process accounting. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * @note This setting is not persistent and will default to disabled after driver unloads. ++ * Enable persistence mode to be sure the setting doesn't switch off to disabled. ++ * ++ * @note Enabling accounting mode has no negative impact on the GPU performance. ++ * ++ * @note Disabling accounting clears all accounting pids information. ++ * ++ * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. ++ * ++ * See \ref nvmlDeviceGetAccountingMode ++ * See \ref nvmlDeviceGetAccountingStats ++ * See \ref nvmlDeviceClearAccountingPids ++ * ++ * @param device The identifier of the target device ++ * @param mode The target accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the new mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); ++ ++/** ++ * Clears accounting information about all processes that have already terminated. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * See \ref nvmlDeviceGetAccountingMode ++ * See \ref nvmlDeviceGetAccountingStats ++ * See \ref nvmlDeviceSetAccountingMode ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if accounting information has been cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup NvLink NvLink Methods ++ * This chapter describes methods that NVML can perform on NVLINK enabled devices. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves the state of the device's NvLink for the link specified ++ * ++ * For Pascal &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that ++ * the link is active and NVML_FEATURE_DISABLED indicates it ++ * is inactive ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isActive has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ++ ++/** ++ * Retrieves the version of the device's NvLink for the link specified ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param version Requested NvLink version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); ++ ++/** ++ * Retrieves the requested capability from the device's NvLink for the link specified ++ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried ++ * The return value should be treated as a boolean. ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried ++ * @param capResult A boolean for the queried capability indicating that feature is available ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a capResult has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, ++ nvmlNvLinkCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieves the PCI information for the remote node on a NvLink link ++ * Note: pciSubSystemId is not filled in this function and is indeterminate ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param pci \a nvmlPciInfo_t of the remote node for the specified link ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ++ ++/** ++ * Retrieves the specified error counter value ++ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the NvLink counter to be queried ++ * @param counterValue Returned counter value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a counterValue has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, ++ nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); ++ ++/** ++ * Resets all error counters to zero ++ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the reset is successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); ++ ++/** ++ * Deprecated: Setting utilization counter control is no longer supported. ++ * ++ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. ++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset ++ * of the counters if the reset parameter is non-zero.
++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param counter Specifies the counter that should be set (0 or 1). ++ * @param link Specifies the NvLink link to be queried ++ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set ++ * @param reset Resets the counters on set if non-zero ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the control has been set successfully ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ nvmlNvLinkUtilizationControl_t *control, unsigned int reset); ++ ++/** ++ * Deprecated: Getting utilization counter control is no longer supported. ++ * ++ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. ++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param counter Specifies the counter that should be set (0 or 1). 
++ * @param link Specifies the NvLink link to be queried ++ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the control has been set successfully ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ nvmlNvLinkUtilizationControl_t *control); ++ ++ ++/** ++ * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. ++ * ++ * Retrieve the NVLINK utilization counter based on the current control for a specified counter. ++ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl ++ * before reading the utilization counters as they have no default state ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the counter that should be read (0 or 1). 
++ * @param rxcounter Receive counter return value ++ * @param txcounter Transmit counter return value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ unsigned long long *rxcounter, unsigned long long *txcounter); ++ ++/** ++ * Deprecated: Freezing NVLINK utilization counters is no longer supported. ++ * ++ * Freeze the NVLINK utilization counters ++ * Both the receive and transmit counters are operated on by this function ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the counter that should be frozen (0 or 1). 
++ * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters ++ * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters ++ * ++ * @return ++ * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, ++ unsigned int counter, nvmlEnableState_t freeze); ++ ++/** ++ * Deprecated: Resetting NVLINK utilization counters is no longer supported. ++ * ++ * Reset the NVLINK utilization counters ++ * Both the receive and transmit counters are operated on by this function ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be reset ++ * @param counter Specifies the counter that should be reset (0 or 1) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if counters were successfully reset ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); ++ ++/** ++* Get the NVLink device type of the remote device connected over the given link. 
++* ++* @param device The device handle of the target GPU ++* @param link The NVLink link index on the target GPU ++* @param pNvLinkDeviceType Pointer in which the output remote device type is returned ++* ++* @return ++* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or ++* \a pNvLinkDeviceType is NULL ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is ++* otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); ++ ++/** ++ * Set NvLink Low Power Threshold for device. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param info Reference to \a nvmlNvLinkPowerThres_t struct ++ * input parameters ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the \a Threshold is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * ++ **/ ++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info); ++ ++/** ++ * Set the global nvlink bandwidth mode ++ * ++ * @param nvlinkBwMode nvlink bandwidth mode ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided ++ * - \ref NVML_ERROR_IN_USE if P2P object exists ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
++ * - \ref NVML_ERROR_NO_PERMISSION if not root user ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetNvlinkBwMode(unsigned int nvlinkBwMode); ++ ++/** ++ * Get the global nvlink bandwidth mode ++ * ++ * @param nvlinkBwMode reference of nvlink bandwidth mode ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. ++ * - \ref NVML_ERROR_NO_PERMISSION if not root user ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetNvlinkBwMode(unsigned int *nvlinkBwMode); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlEvents Event Handling Methods ++ * This chapter describes methods that NVML can perform against each device to register and wait for ++ * some event to occur. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Create an empty set of events. ++ * Event set should be freed by \ref nvmlEventSetFree ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * @param set Reference in which to return the event handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventSetFree ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); ++ ++/** ++ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t ++ * ++ * For Fermi &tm; or newer fully supported devices.
++ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) ++ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) ++ * ++ * For Linux only. ++ * ++ * \b IMPORTANT: Operations on \a set are not thread safe ++ * ++ * This call starts recording of events on specific device. ++ * All events that occurred before this call are not recorded. ++ * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 ++ * ++ * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. ++ * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes ++ * are registered in that case. ++ * ++ * @param device The identifier of the target device ++ * @param eventTypes Bitmask of \ref nvmlEventType to record ++ * @param set Set to which add new event types ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceGetSupportedEventTypes ++ * @see nvmlEventSetWait ++ * @see nvmlEventSetFree ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); ++ ++/** ++ * Returns information about events supported on device ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. 
++ * ++ * @param device The identifier of the target device ++ * @param eventTypes Reference in which to return bitmask of supported events ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the eventTypes has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); ++ ++/** ++ * Waits on events and delivers events ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * If some events are ready to be delivered at the time of the call, function returns immediately. ++ * If there are no events ready to be delivered, function sleeps until an event arrives ++ * but not longer than specified timeout. This function in certain conditions can return before ++ * specified timeout passes (e.g. when interrupt arrives) ++ * ++ * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. ++ * If there are multiple xid errors generated before nvmlEventSetWait_v2 is invoked then the last seen xid error ++ * type is returned for all xid error events. ++ * ++ * On Linux, every xid error event would return the associated event data and other information if applicable. ++ * ++ * In MIG mode, if device handle is provided, the API reports all the events for the available instances, ++ * only if the caller has appropriate privileges. In absence of required privileges, only the events which ++ * affect all the instances (i.e. whole device) are reported. ++ * ++ * This API does not currently support per-instance event reporting using MIG device handles.
++ * ++ * @param set Reference to set of events to wait on ++ * @param data Reference in which to return event data ++ * @param timeoutms Maximum amount of wait time in milliseconds for registered event ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the data has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL ++ * - \ref NVML_ERROR_TIMEOUT if no event arrived within the specified timeout or an interrupt arrived ++ * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); ++ ++/** ++ * Releases events in the set ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param set Reference to events to be released ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event has been successfully released ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlZPI Drain states ++ * This chapter describes methods that NVML can perform against each device to control their drain state ++ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to ++ * power on/off GPUs, enable robust reset scenarios, etc. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Modify the drain state of a GPU.
 This method forces a GPU to no longer accept new incoming requests. ++ * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before ++ * this call is made. ++ * Must be called as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. ++ * ++ * @param pciInfo The PCI address of the GPU drain state to be modified ++ * @param newState The drain state that should be entered, see \ref nvmlEnableState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the drain state was successfully modified ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation ++ * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); ++ ++/** ++ * Query the drain state of a GPU. This method is used to check if a GPU is currently in a draining ++ * state. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported.
++ * ++ * @param pciInfo The PCI address of the GPU drain state to be queried ++ * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the drain state was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); ++ ++/** ++ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver ++ * as long as no other processes are attached. If other processes are attached, this call will return ++ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the ++ * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called ++ * to initiate the draining state is if that process was using, and is still using, a GPU before the ++ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled ++ * prior to this call. ++ * ++ * For long-running NVML processes please note that this will change the enumeration of current GPUs. ++ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. ++ * Also, device handles after the removed GPU will not be valid and must be re-established. ++ * Must be run as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported.
++ * ++ * @param pciInfo The PCI address of the GPU to be removed ++ * @param gpuState Whether the GPU is to be removed from the OS, ++ * see \ref nvmlDetachGpuState_t ++ * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the GPU was successfully removed ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); ++ ++/** ++ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that ++ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. ++ * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes ++ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. ++ * ++ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds ++ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. ++ * ++ * Must be run as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. ++ * ++ * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device ++ * fields are used in this call.
++ * ++ * @return ++ * - \ref NVML_SUCCESS if the rediscovery was successfully performed ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature ++ * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFieldValueQueries Field Value Queries ++ * This chapter describes NVML operations that are associated with retrieving Field Values from NVML ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. ++ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs ++ * will be populated from a single call rather than making a driver call for each fieldId. ++ * ++ * @param device The device handle of the GPU to request field values for ++ * @param valuesCount Number of entries in values that should be retrieved ++ * @param values Array of \a valuesCount structures to hold field values. ++ * Each value's fieldId must be populated prior to this call ++ * ++ * @return ++ * - \ref NVML_SUCCESS if any values in \a values were populated.
 Note that you must ++ * check the nvmlReturn field of each value for each individual ++ * status ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ++ ++/** ++ * Clear values for a list of fields for a device. This API allows multiple fields to be cleared at once. ++ * ++ * @param device The device handle of the GPU to clear field values for ++ * @param valuesCount Number of entries in values that should be cleared ++ * @param values Array of \a valuesCount structures to hold field values. ++ * Each value's fieldId must be populated prior to this call ++ * ++ * @return ++ * - \ref NVML_SUCCESS if any values in \a values were cleared. Note that you must ++ * check the nvmlReturn field of each value for each individual ++ * status ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVirtualGpuQueries vGPU APIs ++ * This chapter describes operations that are associated with NVIDIA vGPU Software products. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * This method is used to get the virtualization mode corresponding to the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device Identifier of the target device ++ * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pVirtualMode is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); ++ ++/** ++ * Queries if SR-IOV host operation is supported on a vGPU supported device. ++ * ++ * Checks whether SR-IOV host capability is supported by the device and the ++ * driver, and indicates device is in SR-IOV mode if both of these conditions ++ * are true. ++ * ++ * @param device The identifier of the target device ++ * @param pHostVgpuMode Reference in which to return the current vGPU mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pHostVgpuMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. ++ * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); ++ ++/** ++ * This method is used to set the virtualization mode corresponding to the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device Identifier of the target device ++ * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a virtualMode is set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a virtualMode is invalid ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. ++ * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); ++ ++/** ++ * Get the vGPU heterogeneous mode for the device. ++ * ++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. ++ * ++ * On successful return, the function returns \a pHeterogeneousMode->mode with the current vGPU heterogeneous mode. ++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should ++ * set the correct version number to retrieve the vGPU heterogeneous mode. ++ * \a pHeterogeneousMode->mode can either be \ref NVML_FEATURE_ENABLED or \ref NVML_FEATURE_DISABLED.
++ * ++ * @param device The identifier of the target device ++ * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice_t device, nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); ++ ++/** ++ * Enable or disable vGPU heterogeneous mode for the device. ++ * ++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. ++ * ++ * API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode ++ * set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of this API ++ * is expected to shut down the vGPU VMs and retry setting the \a pHeterogeneousMode->mode. ++ * On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode. ++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should ++ * set the correct version number to set the vGPU heterogeneous mode.
++ * ++ * @param device Identifier of the target device ++ * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid ++ * - \ref NVML_ERROR_IN_USE If the \a device is in use ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice_t device, const nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); ++ ++/** ++ * Query the placement ID of an active vGPU instance. ++ * ++ * When in vGPU heterogeneous mode, this function returns a valid placement ID as \a pPlacement->placementId ++ * else NVML_INVALID_VGPU_PLACEMENT_ID is returned. ++ * \a pPlacement->version is the version number of the structure nvmlVgpuPlacementId_t, the caller should ++ * set the correct version number to get placement id of the vGPU instance \a vgpuInstance.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param pPlacement Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS If information is successfully retrieved ++ * - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacement is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuPlacementId_t *pPlacement); ++ ++/** ++ * Query the supported vGPU placement IDs of the vGPU type. ++ * ++ * An array of supported vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the ++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be ++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). ++ * ++ * This function will return supported placement IDs even if GPU is not in vGPU heterogeneous mode. ++ * ++ * @param device Identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type.
 The vGPU type ID ++ * @param pPlacementList Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); ++ ++/** ++ * Query the creatable vGPU placement IDs of the vGPU type. ++ * ++ * An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the ++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be ++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). ++ * The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the ++ * vGPU instance is running. ++ * ++ * The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode. ++ * ++ * @param device The identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type.
 The vGPU type ID ++ * @param pPlacementList Pointer to the vGPU placement list structure \a nvmlVgpuPlacementList_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); ++ ++/** ++ * Retrieve the static GSP heap size of the vGPU type in bytes ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param gspHeapSize Reference to return the GSP heap size value ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a gspHeapSize is NULL ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *gspHeapSize); ++ ++/** ++ * Retrieve the static framebuffer reservation of the vGPU type in bytes ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param fbReservation Reference to return the framebuffer reservation ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a fbReservation is NULL ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t
 DECLDIR nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbReservation); ++ ++/** ++ * Set the desired vGPU capability of a device ++ * ++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set. ++ * See \ref nvmlEnableState_t for available states. ++ * ++ * @param device The identifier of the target device ++ * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be set ++ * @param state The target capability mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a capability is invalid, or \a state is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state, or \a device not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, nvmlEnableState_t state); ++ ++/** ++ * Retrieve the vGPU Software licensable features. ++ * ++ * Identifies whether the system supports vGPU Software Licensing. If it does, returns the list of licensable feature(s) ++ * and their current license status.
++ * ++ * @param device Identifier of the target device ++ * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned ++ * ++ * @return ++ * - \ref NVML_SUCCESS if licensable features are successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVgpu vGPU Management ++ * @{ ++ * ++ * This chapter describes APIs supporting NVIDIA vGPU. ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieve the requested vGPU driver capability. ++ * ++ * Refer to the \a nvmlVgpuDriverCapability_t structure for the specific capabilities that can be queried. ++ * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability ++ * is supported. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param capability Specifies the \a nvmlVgpuDriverCapability_t to be queried ++ * @param capResult A boolean for the queried capability indicating that feature is supported ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capability is invalid, or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or devices are not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the requested vGPU capability for GPU. ++ * ++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried. ++ * The return value in \a capResult reports a non-zero value indicating that the capability ++ * is supported, and also reports the capability's data based on the queried capability. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried ++ * @param capResult Specifies that the queried capability is supported, and also returns capability's data ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a capability is invalid, or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a device is not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the supported vGPU types on a physical GPU (device). ++ * ++ * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer ++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount ++ * is used to return the number of vGPU types written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. ++ * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. ++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types ++ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); ++ ++/** ++ * Retrieve the currently creatable vGPU types on a physical GPU (device). ++ * ++ * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer ++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount ++ * is used to return the number of vGPU types written to the buffer. ++ * ++ * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types ++ * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable ++ * list will be restricted to whatever vGPU type is already running on the device. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. ++ * To query the number of vGPU types that can be created for the GPU, call this function with *vgpuCount = 0. ++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types ++ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); ++ ++/** ++ * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeClass Pointer to string array to return class in ++ * @param size Size of string ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); ++ ++/** ++ * Retrieve the vGPU type name. ++ * ++ * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not ++ * exceed 64 characters in length (including the NUL terminator). See \ref ++ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeName Pointer to buffer to return name ++ * @param size Size of buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); ++ ++/** ++ * Retrieve the GPU Instance Profile ID for the given vGPU type ID. ++ * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is ++ * returned. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param gpuInstanceProfileId GPU Instance Profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); ++ ++/** ++ * Retrieve the device ID of a vGPU type. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value ++ * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); ++ ++/** ++ * Retrieve the vGPU framebuffer size in bytes. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param fbSize Pointer to framebuffer size in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); ++ ++/** ++ * Retrieve count of vGPU's supported display heads. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param numDisplayHeads Pointer to number of display heads ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); ++ ++/** ++ * Retrieve vGPU display head's maximum supported resolution. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param displayIndex Zero-based index of display head ++ * @param xdim Pointer to maximum number of pixels in X dimension ++ * @param ydim Pointer to maximum number of pixels in Y dimension ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex ++ * is out of range. ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); ++ ++/** ++ * Retrieve license requirements for a vGPU type ++ * ++ * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form ++ * "\<license name\>,\<version\>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license, ++ * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". ++ * ++ * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
++ * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeLicenseString Pointer to buffer to return license info ++ * @param size Size of \a vgpuTypeLicenseString buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); ++ ++/** ++ * Retrieve the static frame rate limit value of the vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param frameRateLimit Reference to return the frame rate limit value ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); ++ ++/** ++ * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuInstanceCount Pointer to get the max number of vGPU instances ++ * that can be created on a device for given vgpuTypeId ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, ++ * or \a vgpuInstanceCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); ++ ++/** ++ * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); ++ ++/** ++ * Retrieve the BAR1 info for given vGPU type. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param bar1Info Pointer to the vGPU type BAR1 information structure \a nvmlVgpuTypeBar1Info_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a bar1Info is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetBAR1Info(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuTypeBar1Info_t *bar1Info); ++ ++/** ++ * Retrieve the active vGPU instances on a device. ++ * ++ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The ++ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. ++ * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return ++ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer which passes in the array size as well as get ++ * back the number of types ++ * @param vgpuInstances Pointer to array in which to return list of vGPU instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); ++ ++/** ++ * Retrieve the VM ID associated with a vGPU instance. ++ * ++ * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param vmId Pointer to caller-supplied buffer to hold VM ID ++ * @param size Size of buffer in bytes ++ * @param vmIdType Pointer to hold VM ID type ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); ++ ++/** ++ * Retrieve the UUID of a vGPU instance. ++ * ++ * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, ++ * not exceeding 80 characters in length (including the NULL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID ++ * @param size Size of buffer in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); ++ ++/** ++ * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. ++ * ++ * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version ++ * string will not exceed 80 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. ++ * ++ * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is ++ * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the ++ * NVIDIA driver is loaded and initialized. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param version Caller-supplied buffer to return driver version string ++ * @param length Size of \a version buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); ++ ++/** ++ * Retrieve the framebuffer usage in bytes. ++ * ++ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance The identifier of the target instance ++ * @param fbUsage Pointer to framebuffer usage in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); ++ ++/** ++ * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2. ++ * ++ * Retrieve the current licensing state of the vGPU instance. ++ * ++ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param licensed Reference to return the licensing status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a licensed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); ++ ++/** ++ * Retrieve the vGPU type of a vGPU instance. ++ * ++ * Returns the vGPU type ID of vgpu assigned to the vGPU instance. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param vgpuTypeId Reference to return the vgpuTypeId ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a vgpuTypeId has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); ++ ++/** ++ * Retrieve the frame rate limit set for the vGPU instance. ++ * ++ * Returns the value of the frame rate limit set for the vGPU instance ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param frameRateLimit Reference to return the frame rate limit ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a frameRateLimit has been set ++ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); ++ ++/** ++ * Retrieve the current ECC mode of vGPU instance. ++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param eccMode Reference in which to return the current ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a eccMode is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); ++ ++/** ++ * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param encoderCapacity Reference to an unsigned int for the encoder capacity ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); ++ ++/** ++ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param encoderCapacity Unsigned int for the encoder capacity value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); ++ ++/** ++ * Retrieves the current encoder statistics of a vGPU Instance ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param sessionCount Reference to an unsigned int for count of active encoder sessions ++ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions ++ * @param averageLatency Reference to an unsigned int for encode latency in microseconds ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL ++ * or \a vgpuInstance is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, ++ unsigned int *averageFps, unsigned int *averageLatency); ++ ++/** ++ * Retrieves information about all active encoder sessions on a vGPU Instance. ++ * ++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The ++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. ++ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return ++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param sessionCount Reference to caller supplied array size, and returns ++ * the number of sessions. ++ * @param sessionInfo Reference to caller supplied array in which the list ++ * of session information is returned. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionInfo is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is ++ returned in \a sessionCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); ++ ++/** ++* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats ++* ++* @return ++* - \ref NVML_SUCCESS if \a fbcStats is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats); ++ ++/** ++* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
++* ++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The ++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++* written to the buffer. ++* ++* If the supplied buffer is not large enough to accommodate the active session array, the function returns ++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. ++* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return ++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may ++* be zero if there are no new frames captured since the session started. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++* @param sessionInfo Reference in which to return the session information ++* ++* @return ++* - \ref NVML_SUCCESS if \a sessionInfo is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL. ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); ++ ++/** ++* Retrieve the GPU Instance ID for the given vGPU Instance. 
++* The API will return a valid GPU Instance ID for MIG backed vGPU Instance, else INVALID_GPU_INSTANCE_ID is returned. ++* ++* For Kepler &tm; or newer fully supported devices. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param gpuInstanceId GPU Instance ID ++* ++* @return ++* - \ref NVML_SUCCESS successful completion ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a gpuInstanceId is NULL. ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int *gpuInstanceId); ++ ++/** ++* Retrieves the PCI Id of the given vGPU Instance i.e. the PCI Id of the GPU as seen inside the VM. ++* ++* The vGPU PCI id is returned as "00000000:00:00.0" if NVIDIA driver is not installed on the vGPU instance. 
++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param vgpuPciId Caller-supplied buffer to return vGPU PCI Id string ++* @param length Size of the vgpuPciId buffer ++* ++* @return ++* - \ref NVML_SUCCESS if vGPU PCI Id is successfully retrieved ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuPciId is NULL ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small, \a length is set to required length ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length); ++ ++/** ++* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure ++* for the specific capabilities that can be queried. The return value in \a capResult should be treated as ++* a boolean, with a non-zero value indicating that the capability is supported. ++* ++* For Maxwell &tm; or newer fully supported devices.
++* ++* @param vgpuTypeId Handle to vGPU type ++* @param capability Specifies the \a nvmlVgpuCapability_t to be queried ++* @param capResult A boolean for the queried capability indicating that feature is supported ++* ++* @return ++* - \ref NVML_SUCCESS successful completion ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the MDEV UUID of a vGPU instance. ++ * ++ * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, ++ * not exceeding 80 characters in length (including the NULL terminator). ++ * MDEV UUID is displayed only on KVM platform. ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID ++ * @param size Size of buffer in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvml vGPU Migration ++ * This chapter describes operations that are associated with vGPU Migration. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Structure representing range of vGPU versions. ++ */ ++typedef struct nvmlVgpuVersion_st ++{ ++ unsigned int minVersion; //!< Minimum vGPU version. ++ unsigned int maxVersion; //!< Maximum vGPU version. ++} nvmlVgpuVersion_t; ++ ++/** ++ * vGPU metadata structure. 
++ */ ++typedef struct nvmlVgpuMetadata_st ++{ ++ unsigned int version; //!< Current version of the structure ++ unsigned int revision; //!< Current revision of the structure ++ nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields ++ char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest ++ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host ++ unsigned int reserved[6]; //!< Reserved for internal use ++ unsigned int vgpuVirtualizationCaps; //!< vGPU virtualization capabilities bitfield ++ unsigned int guestVgpuVersion; //!< vGPU version of guest driver ++ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes ++ char opaqueData[4]; //!< Opaque data ++} nvmlVgpuMetadata_t; ++ ++/** ++ * Physical GPU metadata structure ++ */ ++typedef struct nvmlVgpuPgpuMetadata_st ++{ ++ unsigned int version; //!< Current version of the structure ++ unsigned int revision; //!< Current revision of the structure ++ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version ++ unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield ++ unsigned int reserved[5]; //!< Reserved for internal use ++ nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver ++ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes ++ char opaqueData[4]; //!< Opaque data ++} nvmlVgpuPgpuMetadata_t; ++ ++/** ++ * vGPU VM compatibility codes ++ */ ++typedef enum nvmlVgpuVmCompatibility_enum ++{ ++ NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable ++ NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) ++ NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) ++ NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) 
++ NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 //!< vGPU is runnable from a live/paused state (ACPI S0) ++} nvmlVgpuVmCompatibility_t; ++ ++/** ++ * vGPU-pGPU compatibility limit codes ++ */ ++typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum ++{ ++ NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited. ++ NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. ++ NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. ++ NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. ++ NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 //!< Compatibility is limited by an undefined factor. ++} nvmlVgpuPgpuCompatibilityLimitCode_t; ++ ++/** ++ * vGPU-pGPU compatibility structure ++ */ ++typedef struct nvmlVgpuPgpuCompatibility_st ++{ ++ nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t ++ nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t ++} nvmlVgpuPgpuCompatibility_t; ++ ++/** ++ * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM ++ * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section ++ * containing internal state. ++ * ++ * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are ++ * dependent on information obtained from the guest VM, which may not yet have reached a state where that information ++ * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field.
++ * ++ * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide ++ * it to Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. ++ * ++ * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure ++ * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. ++ * ++ * @param vgpuInstance vGPU instance handle ++ * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written ++ * @param bufferSize Size of vgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about ++ * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section ++ * containing internal state. ++ * ++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata ++ * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. 
++ * ++ * @param device The identifier of the target device ++ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written ++ * @param bufferSize Pointer to size of \a pgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS GPU metadata structure was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a ++ * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the ++ * physical GPU. ++ * ++ * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The ++ * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility ++ * with the physical GPU is limited, a limit code indicates the factor limiting compatibility. ++ * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). ++ * ++ * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to ++ * boot a given vGPU or associated VM.
++ * ++ * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure ++ * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure ++ * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU compatibility information was successfully returned ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata, \a pgpuMetadata or \a compatibilityInfo is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); ++ ++/** ++ * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. ++ * ++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the ++ * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. ++ * ++ * @param device The identifier of the target device ++ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written ++ * @param bufferSize Pointer to size of \a pgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS GPU metadata string was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Returns the vGPU Software scheduler logs.
++ * \a pSchedulerLog points to a caller-allocated structure to contain the logs. The number of elements returned will ++ * never exceed \a NVML_SCHEDULER_SW_MAX_LOG_ENTRIES. ++ * ++ * To get the entire logs, call the function at least 5 times a second. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerLog Reference in which \a pSchedulerLog is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler logs were successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerLog is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t *pSchedulerLog); ++ ++/** ++ * Returns the vGPU scheduler state. ++ * The information returned in \a nvmlVgpuSchedulerGetState_t is not relevant if the BEST EFFORT policy is set. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerState Reference in which \a pSchedulerState is returned ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler state is successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState); ++ ++/** ++ * Returns the vGPU scheduler capabilities. ++ * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from ++ * the NVML_VGPU_SCHEDULER_POLICY_*.
This list enumerates the supported scheduler policies ++ * if the engine is Graphics type. ++ * The other values in \a nvmlVgpuSchedulerCapabilities_t are also applicable if the engine is ++ * Graphics type. For other engine types, it is BEST EFFORT policy. ++ * If ARR is supported and enabled, scheduling frequency and averaging factor are applicable ++ * else timeSlice is applicable. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pCapabilities Reference in which \a pCapabilities is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler capabilities were successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pCapabilities is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities); ++ ++/** ++ * Sets the vGPU scheduler state. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * The scheduler state change won't persist across module load/unload. ++ * Scheduler state and params can be set only when no VM is running. ++ * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode is enabled then ++ * provide avgFactorForARR and frequency as input. If enableARRMode is disabled ++ * then provide timeslice as input. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerState vGPU \a pSchedulerState to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid ++ * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, ++ * a reboot is required to recover from this error.
++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * or if any vGPU instance currently exists on the \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); ++ ++/* ++ * Virtual GPU (vGPU) version ++ * ++ * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that ++ * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager ++ * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. ++ * ++ * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver ++ * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. ++ */ ++ ++/** ++ * Query the ranges of supported vGPU versions. ++ * ++ * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. ++ * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. ++ * ++ * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: ++ * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. ++ * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. 
++ * ++ * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written ++ * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. ++ * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. ++ */ ++nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); ++ ++/** ++ * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. ++ * ++ * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the ++ * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to ++ * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. ++ * ++ * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. ++ * ++ * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. ++ * ++ * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. ++ * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. ++ * 3. 
If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated ++ * vGPU version that is the maximum value in the overlapping range. ++ * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. ++ * ++ * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. ++ * ++ * @return ++ * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. ++ * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. ++ */ ++nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUtil vGPU Utilization and Accounting ++ * This chapter describes operations that are associated with vGPU Utilization and Accounting. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves current utilization for vGPUs on a physical GPU (device). ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running ++ * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer ++ * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the ++ * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values ++ * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to ++ * indicate the returned value type. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate ++ * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with ++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample ++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or ++ * destroyed. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @param device The identifier for the target device ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values ++ * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances ++ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is ++ * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all ++ * vGPU instances currently executing on the device ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, ++ nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, ++ nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); ++ ++/** ++ * Retrieves recent utilization for vGPU instances running on a physical GPU (device). ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for vGPU ++ * instances running on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied ++ * buffer pointed at by \a vgpuUtilInfo->vgpuUtilArray. 
One utilization sample structure is returned per vGPU instance, and includes the ++ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values ++ * in nvmlValue_t unions. The function sets the caller-supplied \a vgpuUtilInfo->sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to ++ * indicate the returned value type. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a vgpuUtilInfo->vgpuUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuUtilInfo->vgpuInstanceCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate ++ * a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t). Invoke the function again with ++ * the allocated buffer passed in \a vgpuUtilInfo->vgpuUtilArray, and \a vgpuUtilInfo->vgpuInstanceCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuUtilInfo->vgpuInstanceCount with the number of vGPU utilization sample ++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or ++ * destroyed. ++ * ++ * \a vgpuUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a vgpuUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
++ * ++ * @param device The identifier for the target device ++ * @param vgpuUtilInfo Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a vgpuUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. ++ * The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call ++ * the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice_t device, ++ nvmlVgpuInstancesUtilizationInfo_t *vgpuUtilInfo); ++ ++/** ++ * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on ++ * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the ++ * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running ++ * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which ++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size ++ * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with ++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU sub process utilization sample ++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active ++ * in any given sample period. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @param device The identifier for the target device ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances ++ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount is NULL, or a sample count of 0 is ++ * passed with a non-NULL \a utilizationSamples ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all ++ * vGPU instances currently executing on the device ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, ++ unsigned int *vgpuProcessSamplesCount, ++ nvmlVgpuProcessUtilizationSample_t *utilizationSamples); ++ ++/** ++ * Retrieves recent utilization for processes running on vGPU instances on a physical GPU (device). ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for processes running ++ * on vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied ++ * buffer pointed at by \a vgpuProcUtilInfo->vgpuProcUtilArray. One utilization sample structure is returned per process running ++ * on vGPU instances, that had some non-zero utilization during the last sample period.
It includes the CPU timestamp at which ++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a vgpuProcUtilInfo->vgpuProcUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current processes' count ++ * running on vGPU instances in \a vgpuProcUtilInfo->vgpuProcessCount. The caller should allocate a buffer of size ++ * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed ++ * in \a vgpuProcUtilInfo->vgpuProcUtilArray, and \a vgpuProcUtilInfo->vgpuProcessCount set to the number of entries the buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuProcUtilInfo->vgpuProcessCount with the number of vGPU sub process utilization sample ++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active ++ * in any given sample period. ++ * ++ * vgpuProcUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set vgpuProcUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
++ * ++ * @param device The identifier for the target device ++ * @param vgpuProcUtilInfo Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuProcUtilInfo is null ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a vgpuProcUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount ++ * is too small to return samples for all processes on vGPU instances currently executing on the device. ++ * The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, ++ * and call the function again with a buffer of size ++ * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice_t device, nvmlVgpuProcessesUtilizationInfo_t *vgpuProcUtilInfo); ++ ++/** ++ * Queries the state of per process accounting mode on vGPU. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param mode Reference in which to return the current accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); ++ ++/** ++ * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes ++ * returned can be in running or terminated state. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * To just query the maximum number of processes that can be queried, call this function with *count = 0 and ++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. ++ * ++ * For more details see \ref nvmlVgpuInstanceGetAccountingStats. ++ * ++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param count Reference in which to provide the \a pids array size, and ++ * to return the number of elements ready to be queried ++ * @param pids Reference in which to return list of process ids ++ * ++ * @return ++ * - \ref NVML_SUCCESS if pids were successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlVgpuInstanceGetAccountingPids ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); ++ ++/** ++ * Queries process's accounting stats. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and ++ * can be queried during life time of the process or after its termination. ++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and ++ * updated to actual running time after its termination. ++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old ++ * processes. ++ * ++ * See \ref nvmlAccountingStats_t for description of each returned metric. ++ * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. 
++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be ++ * queried since they don't contribute to GPU utilization. ++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported ++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param pid Process Id of the target process to query stats for ++ * @param stats Reference in which to return the process's accounting stats ++ * ++ * @return ++ * - \ref NVML_SUCCESS if stats have been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * or \a stats is not found ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); ++ ++/** ++ * Clears accounting information of the vGPU instance that have already terminated. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. ++ * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications ++ * stats don't contribute to GPU utilization. 
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS if accounting information has been cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); ++ ++/** ++ * Query the license information of the vGPU instance. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param licenseInfo Pointer to vGPU license information structure ++ * ++ * @return ++ * - \ref NVML_SUCCESS if information is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licenseInfo is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlExcludedGpuQueries Excluded GPU Queries ++ * This chapter describes NVML operations that are associated with excluded GPUs. 
++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Excluded GPU device information ++ **/ ++typedef struct nvmlExcludedDeviceInfo_st ++{ ++ nvmlPciInfo_t pciInfo; //!< The PCI information for the excluded GPU ++ char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the excluded GPU ++} nvmlExcludedDeviceInfo_t; ++ ++ /** ++ * Retrieves the number of excluded GPU devices in the system. ++ * ++ * For all products. ++ * ++ * @param deviceCount Reference in which to return the number of excluded devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceCount(unsigned int *deviceCount); ++ ++/** ++ * Acquire the device information for an excluded GPU device, based on its index. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a deviceCount returned by ++ * \ref nvmlGetExcludedDeviceCount(). For example, if \a deviceCount is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * @param index The index of the target GPU, >= 0 and < \a deviceCount ++ * @param info Reference in which to return the device information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a info has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL ++ * ++ * @see nvmlGetExcludedDeviceCount ++ */ ++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t *info); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management ++ * This chapter describes NVML operations that are associated with Multi Instance GPU management.
++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Disable Multi Instance GPU mode. ++ */ ++#define NVML_DEVICE_MIG_DISABLE 0x0 ++ ++/** ++ * Enable Multi Instance GPU mode. ++ */ ++#define NVML_DEVICE_MIG_ENABLE 0x1 ++ ++/** ++ * GPU instance profiles. ++ * ++ * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the ++ * detailed information about a GPU instance such as profile ID, engine counts. ++ */ ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 ++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 ++#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 ++#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 ++#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 ++#define NVML_GPU_INSTANCE_PROFILE_8_SLICE 0x5 ++#define NVML_GPU_INSTANCE_PROFILE_6_SLICE 0x6 ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7 ++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8 ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 ++#define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA ++ ++/** ++ * MIG GPU instance profile capability. ++ * ++ * Bit field values representing MIG profile capabilities ++ * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities ++ */ ++#define NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P 0x1 ++ ++/** ++ * MIG compute instance profile capability. ++ * ++ * Bit field values representing MIG profile capabilities ++ * \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities ++ */ ++/* No capabilities for compute profiles currently exposed */ ++ ++typedef struct nvmlGpuInstancePlacement_st ++{ ++ unsigned int start; //!< Index of first occupied memory slice ++ unsigned int size; //!< Number of memory slices occupied ++} nvmlGpuInstancePlacement_t; ++ ++/** ++ * GPU instance profile information. 
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_st ++{ ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int isP2pSupported; //!< Peer-to-Peer support ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++} nvmlGpuInstanceProfileInfo_t; ++ ++/** ++ * GPU instance profile information (v2). ++ * ++ * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field ++ * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name ++ * field to the end. This structure is not backwards-compatible with ++ * \ref nvmlGpuInstanceProfileInfo_t. 
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_v2_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int isP2pSupported; //!< Peer-to-Peer support ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++} nvmlGpuInstanceProfileInfo_v2_t; ++ ++/** ++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. ++ */ ++#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) ++ ++/** ++ * GPU instance profile information (v3). ++ * ++ * Version 3 removes the isP2pSupported field and adds the \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities ++ * field, relative to \ref nvmlGpuInstanceProfileInfo_v2_t.
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_v3_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v3) ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++ unsigned int capabilities; //!< Additional capabilities ++} nvmlGpuInstanceProfileInfo_v3_t; ++ ++/** ++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v3_t.version. ++ */ ++#define nvmlGpuInstanceProfileInfo_v3 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 3) ++ ++typedef struct nvmlGpuInstanceInfo_st ++{ ++ nvmlDevice_t device; //!< Parent device ++ unsigned int id; //!< Unique instance ID within the device ++ unsigned int profileId; //!< Unique profile ID within the device ++ nvmlGpuInstancePlacement_t placement; //!< Placement for this instance ++} nvmlGpuInstanceInfo_t; ++ ++typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; ++ ++/** ++ * Compute instance profiles. 
++ * ++ * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the ++ * detailed information about a compute instance such as profile ID, engine counts ++ */ ++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 ++#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 ++#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 ++#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 ++#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 ++#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 ++#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 ++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 ++#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 ++ ++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared ++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 ++ ++typedef struct nvmlComputeInstancePlacement_st ++{ ++ unsigned int start; //!< Index of first occupied compute slice ++ unsigned int size; //!< Number of compute slices occupied ++} nvmlComputeInstancePlacement_t; ++ ++/** ++ * Compute instance profile information. ++ */ ++typedef struct nvmlComputeInstanceProfileInfo_st ++{ ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++} nvmlComputeInstanceProfileInfo_t; ++ ++/** ++ * Compute instance profile information (v2). 
++ * ++ * Version 2 adds the \ref nvmlComputeInstanceProfileInfo_v2_t.version field ++ * to the start of the structure, and the \ref nvmlComputeInstanceProfileInfo_v2_t.name ++ * field to the end. This structure is not backwards-compatible with ++ * \ref nvmlComputeInstanceProfileInfo_t. ++ */ ++typedef struct nvmlComputeInstanceProfileInfo_v2_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v2) ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++} nvmlComputeInstanceProfileInfo_v2_t; ++ ++/** ++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v2_t.version. ++ */ ++#define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) ++ ++/** ++ * Compute instance profile information (v3). ++ * ++ * Version 3 adds the \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities field ++ * to the end of \ref nvmlComputeInstanceProfileInfo_v2_t.
++ */ ++typedef struct nvmlComputeInstanceProfileInfo_v3_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v3) ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++ unsigned int capabilities; //!< Additional capabilities ++} nvmlComputeInstanceProfileInfo_v3_t; ++ ++/** ++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v3_t.version. ++ */ ++#define nvmlComputeInstanceProfileInfo_v3 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 3) ++ ++typedef struct nvmlComputeInstanceInfo_st ++{ ++ nvmlDevice_t device; //!< Parent device ++ nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance ++ unsigned int id; //!< Unique instance ID within the GPU instance ++ unsigned int profileId; //!< Unique profile ID within the GPU instance ++ nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} ++} nvmlComputeInstanceInfo_t; ++ ++typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; ++ ++/** ++ * Set MIG mode for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root user. ++ * ++ * This mode determines whether a GPU instance can be created. ++ * ++ * This API may unbind or reset the device to activate the requested mode. 
Thus, the attributes associated with the ++ * device, such as minor number, might change. The caller of this API is expected to query such attributes again. ++ * ++ * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM ++ * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. ++ * ++ * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device ++ * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API ++ * is expected to idle the device and retry setting the \a mode. ++ * ++ * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref ++ * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. ++ * ++ * @param device The identifier of the target device ++ * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * @param activationStatus Reference in which to return the activation status ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a mode or \a activationStatus are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); ++ ++/** ++ * Get MIG mode for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * ++ * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the ++ * next activation trigger.
++ * ++ * @param device The identifier of the target device ++ * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); ++ ++/** ++ * Get GPU instance profile information ++ * ++ * Information provided by this API is immutable throughout the lifetime of a MIG mode. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, ++ nvmlGpuInstanceProfileInfo_t *info); ++ ++/** ++ * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned ++ * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. 
++ * ++ * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the ++ * appropriate version prior to calling this function. For example: ++ * \code ++ * nvmlGpuInstanceProfileInfo_v2_t profileInfo = ++ * { .version = nvmlGpuInstanceProfileInfo_v2 }; ++ * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, ++ * profile, ++ * &profileInfo); ++ * \endcode ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, ++ nvmlGpuInstanceProfileInfo_v2_t *info); ++ ++/** ++ * Get GPU instance placements. ++ * ++ * A placement represents the location of a GPU instance within a device. This API only returns all the possible ++ * placements for the given profile regardless of whether MIG is enabled or not. ++ * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will ++ * fail if there is overlap with the already occupied memory slices. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param placements Returns placements allowed for the profile. Can be NULL to discover number ++ * of allowed placements for this profile. If non-NULL must be large enough ++ * to accommodate the placements supported by the profile. ++ * @param count Returns number of allowed placements for the profile. ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstancePlacement_t *placements, ++ unsigned int *count); ++ ++/** ++ * Get GPU instance profile capacity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param count Returns remaining instance count for the profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, ++ unsigned int *count); ++ ++/** ++ * Create GPU instance.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would ++ * become invalid. The GPU instance must be recreated to acquire a valid handle. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param gpuInstance Returns the GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a gpuInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstance_t *gpuInstance); ++ ++/** ++ * Create GPU instance with the specified placement. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would ++ * become invalid. The GPU instance must be recreated to acquire a valid handle. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param placement The requested placement.
See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 ++ * @param gpuInstance Returns the GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a placement or \a gpuInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, ++ const nvmlGpuInstancePlacement_t *placement, ++ nvmlGpuInstance_t *gpuInstance); ++/** ++ * Destroy GPU instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If the parent device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes ++ * (e.g. CUDA application) or compute instances are active on the ++ * GPU instance. ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); ++ ++/** ++ * Get GPU instances for given profile ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user.
++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param gpuInstances Returns pre-existing GPU instances, the buffer must be large enough to ++ * accommodate the instances supported by the profile. ++ * See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param count The count of returned GPU instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstance_t *gpuInstances, unsigned int *count); ++ ++/** ++ * Get GPU instance for given instance ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param id The GPU instance ID ++ * @param gpuInstance Returns GPU instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); ++ ++/** ++ * Get GPU instance information.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The GPU instance handle ++ * @param info Return GPU instance information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); ++ ++/** ++ * Get compute instance profile information. ++ * ++ * Information provided by this API is immutable throughout the lifetime of a MIG mode. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* ++ * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, ++ unsigned int engProfile, ++ nvmlComputeInstanceProfileInfo_t *info); ++ ++/** ++ * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned ++ * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. 
++ * ++ * @note The caller must set the \ref nvmlComputeInstanceProfileInfo_v2_t.version field to the ++ * appropriate version prior to calling this function. For example: ++ * \code ++ * nvmlComputeInstanceProfileInfo_v2_t profileInfo = ++ * { .version = nvmlComputeInstanceProfileInfo_v2 }; ++ * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, ++ * profile, ++ * engProfile, ++ * &profileInfo); ++ * \endcode ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* ++ * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, ++ unsigned int engProfile, ++ nvmlComputeInstanceProfileInfo_v2_t *info); ++ ++/** ++ * Get compute instance profile capacity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID.
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param count Returns remaining instance count for the profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, ++ unsigned int profileId, unsigned int *count); ++ ++/** ++ * Get compute instance placements. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * A placement represents the location of a compute instance within a GPU instance. This API only returns all the possible ++ * placements for the given profile. ++ * A created compute instance occupies compute slices described by its placement. Creation of new compute instance will ++ * fail if there is overlap with the already occupied compute slices. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param placements Returns placements allowed for the profile. Can be NULL to discover number ++ * of allowed placements for this profile. If non-NULL must be large enough ++ * to accommodate the placements supported by the profile. ++ * @param count Returns number of allowed placements for the profile.
++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, ++ unsigned int profileId, ++ nvmlComputeInstancePlacement_t *placements, ++ unsigned int *count); ++ ++/** ++ * Create compute instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed ++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire ++ * a valid handle. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. 
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param computeInstance Returns the compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Create compute instance with the specified placement. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed ++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire ++ * a valid handle. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. ++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param placement The requested placement. 
See \ref nvmlGpuInstanceGetComputeInstancePossiblePlacements ++ * @param computeInstance Returns the compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstanceWithPlacement(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ const nvmlComputeInstancePlacement_t *placement, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Destroy compute instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param computeInstance The compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if ++ * processes (e.g. CUDA application) are active on the compute instance. ++ */ ++nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); ++ ++/** ++ * Get compute instances for given profile ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. 
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param computeInstances Returns pre-existing compute instances, the buffer must be large enough to ++ * accommodate the instances supported by the profile. ++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param count The count of returned compute instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ nvmlComputeInstance_t *computeInstances, unsigned int *count); ++ ++/** ++ * Get compute instance for given instance ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param id The compute instance ID ++ * @param computeInstance Returns compute instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a id or \a computeInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Get compute instance information.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param computeInstance The compute instance handle ++ * @param info Return compute instance information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); ++ ++/** ++ * Test if the given handle refers to a MIG device. ++ * ++ * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. ++ * These overloaded references can be used (with some restrictions) interchangeably ++ * with a GPU device handle to execute queries at a per-compute instance granularity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device NVML handle to test ++ * @param isMigDevice True when handle refers to a MIG device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device status was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); ++ ++/** ++ * Get GPU instance ID for the given MIG device handle. ++ * ++ * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device Target MIG device handle ++ * @param id GPU instance ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if instance ID was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); ++ ++/** ++ * Get compute instance ID for the given MIG device handle. ++ * ++ * Compute instance IDs are unique per GPU instance and remain valid until the compute instance ++ * is destroyed. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device Target MIG device handle ++ * @param id Compute instance ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if instance ID was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); ++ ++/** ++ * Get the maximum number of MIG devices that can exist under a given parent NVML device. ++ * ++ * Returns zero if MIG is not supported or enabled. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device Target device handle ++ * @param count Count of MIG devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); ++ ++/** ++ * Get MIG device handle for the given index under its parent NVML device. ++ * ++ * If the compute instance is destroyed either explicitly or by destroying, ++ * resetting or unbinding the parent GPU instance or the GPU device itself ++ * the MIG device handle would remain invalid and must be requested again ++ * using this API. Handles may be reused and their properties can change in ++ * the process. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device Reference to the parent GPU device handle ++ * @param index Index of the MIG device ++ * @param migDevice Reference to the MIG device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a migDevice handle was successfully created ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, ++ nvmlDevice_t *migDevice); ++ ++/** ++ * Get parent device handle from a MIG device handle. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param migDevice MIG device handle ++ * @param device Device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device handle was successfully created ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); ++ ++/** @} */ // @defgroup nvmlMultiInstanceGPU ++ ++ ++/***************************************************************************************************/ ++/** @defgroup GPM NVML GPM ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmEnums GPM Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * GPM Metric Identifiers ++ */ ++typedef enum ++{ ++ NVML_GPM_METRIC_GRAPHICS_UTIL = 1, //!< Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 ++ NVML_GPM_METRIC_SM_UTIL = 2, //!< Percentage of SMs that were busy. 0.0 - 100.0 ++ NVML_GPM_METRIC_SM_OCCUPANCY = 3, //!< Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 ++ NVML_GPM_METRIC_INTEGER_UTIL = 4, //!< Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, //!< Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, //!< Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, //!< Percentage of time the GPU's SMs were doing HMMA tensor operations. 
0.0 - 100.0 ++ NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, //!< Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_DRAM_BW_UTIL = 10, //!< Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ ++ NVML_GPM_METRIC_FP64_UTIL = 11, //!< Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_FP32_UTIL = 12, //!< Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_FP16_UTIL = 13, //!< Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, //!< PCIe traffic from this GPU in MiB/sec ++ NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, //!< PCIe traffic to this GPU in MiB/sec ++ NVML_GPM_METRIC_NVDEC_0_UTIL = 30, //!< Percent utilization of NVDEC 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_1_UTIL = 31, //!< Percent utilization of NVDEC 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_2_UTIL = 32, //!< Percent utilization of NVDEC 2. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_3_UTIL = 33, //!< Percent utilization of NVDEC 3. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_4_UTIL = 34, //!< Percent utilization of NVDEC 4. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_5_UTIL = 35, //!< Percent utilization of NVDEC 5. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_6_UTIL = 36, //!< Percent utilization of NVDEC 6. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_7_UTIL = 37, //!< Percent utilization of NVDEC 7. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_0_UTIL = 40, //!< Percent utilization of NVJPG 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_1_UTIL = 41, //!< Percent utilization of NVJPG 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_2_UTIL = 42, //!< Percent utilization of NVJPG 2. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_3_UTIL = 43, //!< Percent utilization of NVJPG 3. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_4_UTIL = 44, //!< Percent utilization of NVJPG 4. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_5_UTIL = 45, //!< Percent utilization of NVJPG 5. 
0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_6_UTIL = 46, //!< Percent utilization of NVJPG 6. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_7_UTIL = 47, //!< Percent utilization of NVJPG 7. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVOFA_0_UTIL = 50, //!< Percent utilization of NVOFA 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVOFA_1_UTIL = 51, //!< Percent utilization of NVOFA 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, //!< NvLink read bandwidth for all links in MiB/sec ++ NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, //!< NvLink write bandwidth for all links in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, //!< NvLink read bandwidth for link 0 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, //!< NvLink write bandwidth for link 0 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, //!< NvLink read bandwidth for link 1 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, //!< NvLink write bandwidth for link 1 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, //!< NvLink read bandwidth for link 2 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, //!< NvLink write bandwidth for link 2 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, //!< NvLink read bandwidth for link 3 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, //!< NvLink write bandwidth for link 3 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, //!< NvLink read bandwidth for link 4 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, //!< NvLink write bandwidth for link 4 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, //!< NvLink read bandwidth for link 5 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, //!< NvLink write bandwidth for link 5 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, //!< NvLink read bandwidth for link 6 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, //!< NvLink write bandwidth for link 6 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, //!< NvLink read bandwidth for link 7 in MiB/sec ++ 
NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, //!< NvLink write bandwidth for link 7 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, //!< NvLink read bandwidth for link 8 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, //!< NvLink write bandwidth for link 8 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, //!< NvLink read bandwidth for link 9 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, //!< NvLink write bandwidth for link 9 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, //!< NvLink read bandwidth for link 10 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, //!< NvLink write bandwidth for link 10 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, //!< NvLink read bandwidth for link 11 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, //!< NvLink write bandwidth for link 11 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, //!< NvLink read bandwidth for link 12 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, //!< NvLink write bandwidth for link 12 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, //!< NvLink read bandwidth for link 13 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, //!< NvLink write bandwidth for link 13 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, //!< NvLink read bandwidth for link 14 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, //!< NvLink write bandwidth for link 14 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, //!< NvLink read bandwidth for link 15 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, //!< NvLink write bandwidth for link 15 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, //!< NvLink read bandwidth for link 16 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, //!< NvLink write bandwidth for link 16 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, //!< NvLink read bandwidth for link 17 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, //!< NvLink 
write bandwidth for link 17 in MiB/sec ++ //Put new metrics for BLACKWELL here... ++ NVML_GPM_METRIC_MAX = 98, //!< Maximum value above +1. Note that changing this should also change NVML_GPM_METRICS_GET_VERSION due to struct size change ++} nvmlGpmMetricId_t; ++ ++/** @} */ // @defgroup nvmlGpmEnums ++ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmStructs GPM Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc(). Free this with nvmlGpmSampleFree(). ++ */ ++typedef struct nvmlGpmSample_st* nvmlGpmSample_t; ++ ++/** ++ * GPM metric information. ++ */ ++typedef struct ++{ ++ unsigned int metricId; //!< IN: NVML_GPM_METRIC_? define of which metric to retrieve ++ nvmlReturn_t nvmlReturn; //!< OUT: Status of this metric. If this is nonzero, then value is not valid ++ double value; //!< OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) ++ struct ++ { ++ char *shortName; ++ char *longName; ++ char *unit; ++ } metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined ++} nvmlGpmMetric_t; ++ ++/** ++ * GPM buffer information. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< IN: Set to NVML_GPM_METRICS_GET_VERSION ++ unsigned int numMetrics; //!< IN: How many metrics to retrieve in metrics[] ++ nvmlGpmSample_t sample1; //!< IN: Sample buffer ++ nvmlGpmSample_t sample2; //!< IN: Sample buffer ++ nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; //!< IN/OUT: Array of metrics. Set metricId on call. See nvmlReturn and value on return ++} nvmlGpmMetricsGet_t; ++ ++#define NVML_GPM_METRICS_GET_VERSION 1 ++ ++/** ++ * GPM device information. 
++ */ ++typedef struct ++{ ++ unsigned int version; //!< IN: Set to NVML_GPM_SUPPORT_VERSION ++ unsigned int isSupportedDevice; //!< OUT: Indicates device support ++} nvmlGpmSupport_t; ++ ++#define NVML_GPM_SUPPORT_VERSION 1 ++ ++/** @} */ // @defgroup nvmlGPMStructs ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmFunctions GPM Functions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Calculate GPM metrics from two samples. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); ++ ++ ++/** ++ * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param gpmSample Sample to free ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); ++ ++ ++/** ++ * Allocate a sample buffer to be used with NVML GPM . You will need to allocate ++ * at least two of these buffers to use with the NVML GPM feature ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param gpmSample Where the allocated sample will be stored ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ * - \ref NVML_ERROR_MEMORY if system memory is insufficient ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); ++ ++/** ++ * Read a sample of GPM metrics into the provided \a gpmSample buffer. 
After ++ * two samples are gathered, you can call nvmlGpmMetricsGet on those samples to ++ * retrieve metrics ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device Device to get samples for ++ * @param gpmSample Buffer to read samples into ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample); ++ ++/** ++ * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance. ++ * ++ * After two samples are gathered, you can call nvmlGpmMetricsGet on those ++ * samples to retrieve metrics ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device Device to get samples for ++ * @param gpuInstanceId MIG GPU Instance ID ++ * @param gpmSample Buffer to read samples into ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample); ++ ++/** ++ * Indicate whether the supplied device supports GPM ++ * ++ * @param device NVML device to query for ++ * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates ++ * GPM support per system for the supplied device ++ * ++ * @return ++ * - NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum if there is an error in processing the query ++ */ ++nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); ++ ++/* GPM Stream State */ ++/** ++ * Get GPM stream state. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC.
++ * ++ * @param device The identifier of the target device ++ * @param state Returns GPM stream state ++ * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPM stream state were successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a state is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlGpmQueryIfStreamingEnabled(nvmlDevice_t device, unsigned int *state); ++ ++/** ++ * Set GPM stream state. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param state GPM stream state, ++ * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPM stream state is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSetStreamingEnabled(nvmlDevice_t device, unsigned int state); ++ ++/** @} */ // @defgroup nvmlGpmFunctions ++/** @} */ // @defgroup GPM ++ ++#define NVML_DEV_CAP_EGM (1 << 0) // Extended GPU memory ++/** ++ * Device capabilities ++ */ ++typedef struct ++{ ++ unsigned int version; //!< the API version number ++ unsigned int capMask; //!< OUT: Bit mask of capabilities. ++} nvmlDeviceCapabilities_v1_t; ++typedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t; ++#define nvmlDeviceCapabilities_v1 NVML_STRUCT_VERSION(DeviceCapabilities, 1) ++ ++/** ++ * Get device capabilities ++ * ++ * See \ref nvmlDeviceCapabilities_v1_t for more information on the struct. 
++ * ++ * @param device The identifier of the target device ++ * @param caps Returns GPU's capabilities ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the query is successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a caps is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCapabilities(nvmlDevice_t device, ++ nvmlDeviceCapabilities_t *caps); ++ ++/** ++ * NVML API versioning support ++ */ ++ ++#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS ++nvmlReturn_t DECLDIR nvmlInit(void); ++nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); ++nvmlReturn_t DECLDIR
nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); ++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); ++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); ++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); ++#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS ++ ++#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) ++// We don't define APIs to run new versions if this guard is present so there is ++// no need to undef ++#elif defined(__NVML_API_VERSION_INTERNAL) ++#undef nvmlDeviceGetGraphicsRunningProcesses ++#undef nvmlDeviceGetComputeRunningProcesses ++#undef nvmlDeviceGetMPSComputeRunningProcesses ++#undef nvmlDeviceGetAttributes 
++#undef nvmlComputeInstanceGetInfo ++#undef nvmlEventSetWait ++#undef nvmlDeviceGetGridLicensableFeatures ++#undef nvmlDeviceRemoveGpu ++#undef nvmlDeviceGetNvLinkRemotePciInfo ++#undef nvmlDeviceGetPciInfo ++#undef nvmlDeviceGetCount ++#undef nvmlDeviceGetHandleByIndex ++#undef nvmlDeviceGetHandleByPciBusId ++#undef nvmlInit ++#undef nvmlBlacklistDeviceInfo_t ++#undef nvmlGetBlacklistDeviceCount ++#undef nvmlGetBlacklistDeviceInfoByIndex ++#undef nvmlDeviceGetGpuInstancePossiblePlacements ++#undef nvmlVgpuInstanceGetLicenseInfo ++#undef nvmlDeviceGetDriverModel ++#undef nvmlDeviceSetPowerManagementLimit ++ ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/contrib/nvml.py b/contrib/nvml.py +index 9f2c57d..2516979 100644 +--- a/contrib/nvml.py ++++ b/contrib/nvml.py +@@ -1,6 +1,7 @@ + import re ++import os + +-PATH="/usr/local/cuda/include/nvml.h" ++PATH=["./contrib/nvml.h", "/usr/local/cuda/include/nvml.h"] + func = ["nvmlInit", + "nvmlDeviceGetSupportedEventTypes", + "nvmlDeviceRegisterEvents", +@@ -22,7 +23,13 @@ type_pattern = re.compile( + flags=re.MULTILINE + ) + +-with open(PATH, 'r') as file: ++path="" ++if os.path.exists(PATH[0]) and os.access(PATH[0], os.R_OK): ++ path = PATH[0] ++else: ++ path = PATH[1] ++ ++with open(path, 'r') as file: + content = file.read() + matched_lines = pattern.findall(content) + type_lines = type_pattern.findall(content) +@@ -55,7 +62,7 @@ print(''' + ) + print('#include \ + \n#include \ +- \n#include "/usr/local/cuda/include/nvml.h"') ++ \n#include "{}"'.format(path)) + print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);') + print('\n'.join(func_declares)) + print('\nmy_nvmlErrorString_p my_nvmlErrorString;') +-- +2.43.5 + diff --git a/1024-anolis-do-not-print-teq-error.patch b/1024-anolis-do-not-print-teq-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fbe782120b8c768ea8e0a55945f60751fd50adc --- /dev/null +++ 
b/1024-anolis-do-not-print-teq-error.patch @@ -0,0 +1,50 @@ +From c6a9ca106c41e1f351849bce5d491bba3813cc10 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 17 Apr 2025 17:26:48 +0800 +Subject: [PATCH 24/30] anolis: do not print teq error + +Signed-off-by: Ruidong Tian +--- + ras-cxl-handler.c | 2 +- + ras-mce-handler.c | 6 +++--- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 575fff8..55509f1 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -718,7 +718,7 @@ static int handle_ras_cxl_common_hdr(struct trace_seq *s, + if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0) + return -1; + +- if (tep_get_field_val(s, event, "hdr_maint_op_sub_class", record, &val, 1) < 0) ++ if (tep_get_field_val(s, event, "hdr_maint_op_sub_class", record, &val, 0) < 0) + return -1; + hdr->hdr_maint_op_sub_class = val; + if (trace_seq_printf(s, "hdr_maint_op_sub_class:%u ", hdr->hdr_maint_op_sub_class) <= 0) +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index fc2e8d4..0f0d37f 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -571,15 +571,15 @@ int ras_mce_event_handler(struct trace_seq *s, + e.ipid = val; + + /* Get PPIN */ +- if (!tep_get_field_val(s, event, "ppin", record, &val, 1)) ++ if (!tep_get_field_val(s, event, "ppin", record, &val, 0)) + e.ppin = val; + + /* Get Microcode Revision */ +- if (!tep_get_field_val(s, event, "microcode", record, &val, 1)) ++ if (!tep_get_field_val(s, event, "microcode", record, &val, 0)) + e.microcode = val; + + /* Get Vendor-specfic Data, if any */ +- e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1); ++ e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 0); + + switch (mce->cputype) { + case CPU_GENERIC: +-- +2.43.5 + diff --git a/1025-anolis-add-init.sh-for-different-user.patch b/1025-anolis-add-init.sh-for-different-user.patch new file mode 100644 index 
0000000000000000000000000000000000000000..90dd4af91b6a349ef2135d5b1b12e3753b974ef5 --- /dev/null +++ b/1025-anolis-add-init.sh-for-different-user.patch @@ -0,0 +1,104 @@ +From bec7414b742dc7164d7674a0eb9489c4723514ab Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 18 Apr 2025 15:43:57 +0800 +Subject: [PATCH 25/30] anolis: add init.sh for different user + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/rasdaemon.init | 26 ++++++++++++++++++++++++++ + misc/rasdaemon.spec.in | 18 ++++++++++++------ + 3 files changed, 39 insertions(+), 6 deletions(-) + create mode 100644 contrib/rasdaemon.init + +diff --git a/Makefile.am b/Makefile.am +index 4aba962..203b576 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -24,6 +24,7 @@ EXTRA_DIST = \ + $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ + misc/notices \ ++ contrib/rasdaemon.init \ + contrib/nvml.py \ + contrib/nvml.h \ + contrib/*_trigger +diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init +new file mode 100644 +index 0000000..d575af9 +--- /dev/null ++++ b/contrib/rasdaemon.init +@@ -0,0 +1,26 @@ ++#!/bin/sh ++target=$1 ++ENV_PATH="/etc/sysconfig/rasdaemon" ++ ++case "$target" in ++ ecs) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ ;; ++ ebs) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} ++ sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH} ++ sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH} ++ sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH} ++ ;; ++ jituan) ++ sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH} ++ sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH} ++ exit 1 ++ ;; ++ zhuanyou) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ sed -i 
's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} ++ ;; ++ ++esac +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 23be188..bf4cc4b 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -61,6 +61,7 @@ install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{ + install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext + install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 contrib/%{name}.init %{buildroot}/usr/share/%{name}/%{name}.init + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -71,12 +72,13 @@ rm INSTALL %{buildroot}/usr/include/*.h + %{_unitdir}/*.service + %{_sysconfdir}/ras/dimm_labels.d + %{_sysconfdir}/ras/*/* +-%config(noreplace) %{_sysconfdir}/sysconfig/%{name} +-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng +-%config(noreplace) /usr/share/%{name}/%{name}.logrotate +-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog +-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext +-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext ++%{_sysconfdir}/sysconfig/%{name} ++/usr/share/%{name}/%{name}.syslog-ng ++/usr/share/%{name}/%{name}.logrotate ++/usr/share/%{name}/%{name}.rsyslog ++/usr/share/%{name}/%{name}.syslog-ng-ext ++/usr/share/%{name}/%{name}.rsyslog-ext ++/usr/share/%{name}/%{name}.init + %{_sysconfdir}/rasdaemon_notices/* + + %post +@@ -104,6 +106,10 @@ if ! 
systemctl is-enabled --quiet %{name}.service; then + echo "Rasdaemon service is not enabled, enable it"; + systemctl enable %{name}.service; + fi ++echo "Rasdaemon install for ${RASDAEMON_TARGET}"; ++/usr/share/%{name}/%{name}.init ${RASDAEMON_TARGET} ++ ++systemctl daemon-reload + systemctl restart %{name}.service + + %preun +-- +2.43.5 + diff --git a/1026-anolis-fix-systemd-config.patch b/1026-anolis-fix-systemd-config.patch new file mode 100644 index 0000000000000000000000000000000000000000..45770982f50740631b332e895af7bd2785fe6587 --- /dev/null +++ b/1026-anolis-fix-systemd-config.patch @@ -0,0 +1,30 @@ +From 09d282c32c52224af0b7310b24e6ddf4cd4efb61 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 18 Apr 2025 16:47:46 +0800 +Subject: [PATCH 26/30] anolis: fix systemd config + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.service.in | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in +index 0bb643f..c72b2d7 100644 +--- a/misc/rasdaemon.service.in ++++ b/misc/rasdaemon.service.in +@@ -7,10 +7,10 @@ Description=RAS daemon to log the RAS events + + [Service] + EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon +-ExecStart=@sbindir@/rasdaemon -f -r ++ExecStart=@sbindir@/rasdaemon -f + ExecStartPost=@sbindir@/rasdaemon --enable + ExecStop=@sbindir@/rasdaemon --disable +-Restart=on-abort ++Restart=always + + [Install] + WantedBy=multi-user.target +-- +2.43.5 + diff --git a/1027-anolis-add-nvgpu-driver.patch b/1027-anolis-add-nvgpu-driver.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca4bc5c0dfa43357b0e20589b39f8630e69378bd --- /dev/null +++ b/1027-anolis-add-nvgpu-driver.patch @@ -0,0 +1,590 @@ +From ed059449efe2ce84e1c7cffdc5502430052c043e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 23 Apr 2025 11:17:32 +0800 +Subject: [PATCH 1/3] anolis: add nvgpu driver + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 22 ++- + configure.ac | 5 + + 
ras-nvgpu-driver.c | 444 +++++++++++++++++++++++++++++++++++++++++++++ + ras-nvgpu-nvml.c | 2 - + ras-nvgpu.c | 10 +- + ras-nvgpu.h | 2 + + 6 files changed, 476 insertions(+), 9 deletions(-) + create mode 100644 ras-nvgpu-driver.c + +diff --git a/Makefile.am b/Makefile.am +index 203b576..c400473 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -27,7 +27,9 @@ EXTRA_DIST = \ + contrib/rasdaemon.init \ + contrib/nvml.py \ + contrib/nvml.h \ +- contrib/*_trigger ++ contrib/*_trigger \ ++ libnvgpudriver_x86_64.a \ ++ libnvgpudriver_aarch64.a + + CLEANFILES= \ + ras-nvgpu-nvml.h \ +@@ -148,14 +150,16 @@ if WITH_ERST + endif + + if WITH_NVGPU +- BUILT_SOURCES = ras-nvgpu-nvml.h ++ BUILT_SOURCES = ras-nvgpu-nvml.h libnvgpudriver.a + ras-nvgpu-nvml.h: contrib/nvml.py + python3 $< > $@ ++libnvgpudriver.a: nvgpu_driver ++ cp libnvgpudriver_$(shell uname -m).a $@ + rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) +-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) $(NVGPU_LIBS) ++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) $(NVGPU_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ +@@ -210,3 +214,13 @@ install-data-local: + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.rsyslog-ext"; \ + fi + $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" ++ ++nvgpu_driver: ++ if [ ! 
-d "open-gpu-kernel-modules" ]; then git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git -b 570; fi ++ $(CC) -o ras-nvgpu-driver.o -I./open-gpu-kernel-modules/kernel-open/common/inc \ ++ -I./open-gpu-kernel-modules/kernel-open/nvidia-uvm \ ++ -I./open-gpu-kernel-modules/src/common/sdk/nvidia/inc \ ++ -I./open-gpu-kernel-modules/src/nvidia/arch/nvalloc/unix/include \ ++ $(LIBTRACEEVENT_CFLAGS) \ ++ -O2 -fPIE -c ras-nvgpu-driver.c ++ ar rcs libnvgpudriver_$(shell uname -m).a ras-nvgpu-driver.o +diff --git a/configure.ac b/configure.ac +index 68fcb75..46ba36e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -303,10 +303,15 @@ AC_ARG_ENABLE([nvgpu], + AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect") + AC_SUBST([WITH_NVGPU]) ++ NVGPU_LIBS="-lnvgpudriver" ++ NVGPU_CFLAGS="-L." + ]) + AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) + ++AC_SUBST([NVGPU_LIBS]) ++AC_SUBST([NVGPU_CFLAGS]) ++ + AC_ARG_ENABLE([kmsg_monitor], + AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)])) + +diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c +new file mode 100644 +index 0000000..a72a7c5 +--- /dev/null ++++ b/ras-nvgpu-driver.c +@@ -0,0 +1,444 @@ ++ ++#include "nvtypes.h" ++#include ++#include // NV01_DEVICE_0 ++#include // NV20_SUBDEVICE_0 ++ ++#include ++#include // VOLTA_CHANNELChannelGPFifoA ++#include // NV20_SUBDEVICE_0 ++#include // NV20_SUBDEVICE_0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-nvgpu.h" ++#include ++#define NV_PLATFORM_MAX_IOCTL_SIZE 16384 ++#include "nv.h" ++#include "nvos.h" ++#include "nv_escape.h" ++ ++#include "nvstatus.h" ++ ++#define NV_PRINTF_STRING_SECTION ++#undef NV_STATUS_CODE ++#undef
SDK_NVSTATUSCODES_H ++#define NV_STATUS_CODE( name, code, string ) static NV_PRINTF_STRING_SECTION \ ++ const char rm_pvt_##name##_str[] = string " [" #name "]"; ++#include "nvstatuscodes.h" ++ ++#undef NV_STATUS_CODE ++#undef SDK_NVSTATUSCODES_H ++#define NV_STATUS_CODE( name, code, string ) [code] = { name, rm_pvt_##name##_str }, ++static struct NvStatusCodeString ++{ ++ NV_STATUS statusCode; ++ const char *statusString; ++} g_StatusCodeList[] = { ++ #include "nvstatuscodes.h" ++}; ++#undef NV_STATUS_CODE ++ ++#include ++ ++#define assert_with_message(condition, message, ...) \ ++ do { \ ++ if (!(condition)) { \ ++ log(ALL, LOG_ERR, "%s Assertion failed: %s: " message "\n", \ ++ __func__, #condition, ##__VA_ARGS__); \ ++ ret = 1; \ ++ } \ ++ } while (0) ++ ++#define nv_assert_ioctl(fd, cmd, p) \ ++ do { \ ++ int r = ioctl(fd, __NV_IOWR(cmd, p), &p); \ ++ assert_with_message(r == 0, "%s", strerror(r)); \ ++ assert_with_message(p.status == 0, "%s", g_StatusCodeList[p.status].statusString); \ ++ } while (0) ++ ++#define error_exit(a, free) \ ++ do { \ ++ a; \ ++ if (ret) goto free; \ ++ } while (0) ++ ++static int ret; ++static void alloc_root(int fd_ctl, NvHandle *root) { ++ NVOS64_PARAMETERS p = { ++ .hClass = NV01_ROOT_CLIENT ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ *root = p.hObjectNew; ++} ++ ++static void free_nvgpu(int fd_ctl, NvHandle root, NvHandle obj, NvHandle old_obj) { ++ NVOS00_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = obj, .hObjectOld = old_obj ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_FREE, p); ++} ++ ++static void alloc_device(int fd_ctl, NvHandle root, NV0080_ALLOC_PARAMETERS *dev, NvHandle *device) { ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = root, .hClass = NV01_DEVICE_0, .pAllocParms = dev, .paramsSize = sizeof(*dev) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ *device = p.hObjectNew; ++} ++ ++static void alloc_subdevice(int fd_ctl, NvHandle root, NvHandle parent, 
NV2080_ALLOC_PARAMETERS *subdev, NvHandle *subdevice) { ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = parent, .hClass = NV20_SUBDEVICE_0, .pAllocParms = subdev, .paramsSize = sizeof(*subdev) ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ ++ *subdevice = p.hObjectNew; ++} ++ ++static void wait_open(int fd_dev) ++{ ++ nv_ioctl_wait_open_complete_t p = { 0 }; ++ ++ int rc = ioctl(fd_dev, __NV_IOWR(NV_ESC_WAIT_OPEN_COMPLETE, p), &p); /* must not be named "ret": assert_with_message() assigns the file-scope ret that error_exit() tests */ ++ assert_with_message(rc == 0, "%s", strerror(rc)); /* NOTE(review): ioctl() reports failure as -1 with the cause in errno; strerror() probably wants errno here - confirm */ ++} ++ ++static void get_pci(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS *pci) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PCI_INFO, .params = pci, .paramsSize = sizeof(*pci) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void attach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_ATTACH_IDS_PARAMS *attach) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_ATTACH_IDS, .params = attach, .paramsSize = sizeof(*attach) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void deattach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_DETACH_IDS_PARAMS *attach) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_DETACH_IDS, .params = attach, .paramsSize = sizeof(*attach) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS *probe) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PROBED_IDS, .params = probe, .paramsSize = sizeof(*probe) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_id_info(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_ID_INFO_PARAMS *info) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_ID_INFO, .params = info,
.paramsSize = sizeof(*info) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void register_fd(int fd_dev, int fd_ctl) { ++ nv_ioctl_register_fd_t p = { .ctl_fd = fd_ctl }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_REGISTER_FD, p), &p); ++ assert(ret == 0); ++} ++ ++static void alloc_event(int fd_dev, NvHandle root, NvHandle device, int fd_uvm) { ++ nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_uvm }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p); ++ assert(ret == 0); ++} ++ ++static void free_event(int fd_dev, NvHandle root, NvHandle device) { ++ nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_dev }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p); ++ assert(ret == 0); ++} ++ ++static void event_os_event(int fd_dev, NvHandle root, NvHandle subdevice, int index, NvHandle *event, int fd_uvm) { ++ NV0005_ALLOC_PARAMETERS pp = { .hParentClient = root, .data = (NvP64)fd_uvm, .notifyIndex = index, .hClass = NV01_EVENT_OS_EVENT }; ++ ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = subdevice, .hClass = NV01_EVENT_OS_EVENT, .pAllocParms = &pp, .paramsSize = sizeof(pp) ++ }; ++ ++ nv_assert_ioctl(fd_dev, NV_ESC_RM_ALLOC, p); ++ *event = p.hObjectNew; ++} ++ ++static void set_event(int fd_ctl, NvHandle root, NvHandle subdevice, int index, int type) ++{ ++ NV2080_CTRL_EVENT_SET_NOTIFICATION_PARAMS set = { .event = index, .action = type, .bNotifyState = 0 }; ++ ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = subdevice, .cmd = NV2080_CTRL_CMD_EVENT_SET_NOTIFICATION, .params = &set, .paramsSize = sizeof(set) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_event(NvUnixEvent *event, int fd_dev, int fd_uvm, NvHandle root, NvHandle subdevice, int i) ++{ ++ NVOS41_PARAMETERS p = { .pEvent = event, .MoreEvents = 0 }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_RM_GET_EVENT_DATA, p), &p); ++ 
assert(ret == 0); ++} ++ ++struct ras_nvgpu_event { ++ NvHandle event; ++ NvV32 index; ++}; ++ ++#define NVGPU_EVENT_NUM 10 ++struct ras_nvgpu_driver { ++ NvHandle device; ++ NvHandle subdevice; ++ NvU32 gpu_id; ++ int fd; ++ NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci; ++ struct ras_nvgpu_event events[NVGPU_EVENT_NUM]; ++}; ++ ++static int event_index[NVGPU_EVENT_NUM] = { ++ NV2080_NOTIFIERS_RC_ERROR, ++ NV2080_NOTIFIERS_ECC_DBE, ++ NV2080_NOTIFIERS_NVLINK_ERROR_FATAL, ++ NV2080_NOTIFIERS_NVLINK_ERROR_RECOVERY_REQUIRED, ++ NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL, ++ NV2080_NOTIFIERS_POISON_ERROR_FATAL, ++ NV2080_NOTIFIERS_NVLINK_INFO_LINK_DOWN, ++ NV2080_NOTIFIERS_ECC_SBE_STORM, ++ NV2080_NOTIFIERS_NVLINK_UNCONTAINED_ERROR, ++ NV2080_NOTIFIERS_GPU_UNAVAILABLE ++}; ++ ++static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *event) ++{ ++ struct trace_seq s; ++ time_t now; ++ struct tm *tm; ++ char timestamp[64]; ++ ++ time(&now); ++ tm = localtime(&now); ++ ++ if (tm) ++ strftime(timestamp, sizeof(timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ trace_seq_init(&s); ++ if (event->NotifyIndex == NV2080_NOTIFIERS_RC_ERROR) { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); ++ trace_seq_printf(&s, "xid: %d ", event->info32); ++ trace_seq_printf(&s, "data1: %d ", event->info16); ++ } else { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); ++ trace_seq_printf(&s, "event_type: %d ", event->NotifyIndex); ++ trace_seq_printf(&s, "data: %d ", event->info32); ++ trace_seq_printf(&s, "data1: %d ", event->info16); ++ ++ } ++ ++ trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ ++ trace_seq_terminate(&s); ++ 
trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++ ++ return 0; ++} ++ ++int ras_nvgpu_driver_handle(void) { ++ int fd_ctl = 0, fd_uvm = 0, i, gpu_count = 0; ++ NvHandle root = 0; ++ struct pollfd *pfd; ++ ++ fd_uvm = open("/dev/nvidia-uvm", O_RDWR | O_CLOEXEC); ++ if (fd_uvm < 0) { /* validate the /dev/nvidia-uvm descriptor opened just above */ ++ perror("open"); ++ return 1; ++ } ++ ++ fd_ctl = open("/dev/nvidiactl", O_RDWR | O_CLOEXEC); ++ if (fd_ctl < 0) { ++ perror("open"); ++ ret = 1; ++ goto close_uvm; ++ } ++ ++ error_exit(alloc_root(fd_ctl, &root), close); ++ ++ NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS id = {0}; ++ NV0000_CTRL_GPU_ATTACH_IDS_PARAMS attach = {0}; ++ NV0000_CTRL_GPU_DETACH_IDS_PARAMS detach = {0}; ++ error_exit(get_id(fd_ctl, root, &id), free_root); ++ ++ for (i = 0; i < NV0000_CTRL_GPU_MAX_PROBED_GPUS; i++) { ++ if (id.gpuIds[i] == NV0000_CTRL_GPU_INVALID_ID) ++ break; ++ ++ attach.gpuIds[i] = id.gpuIds[i]; ++ detach.gpuIds[i] = id.gpuIds[i]; ++ } ++ gpu_count = i; ++ attach.gpuIds[i] = NV0000_CTRL_GPU_INVALID_ID; ++ ++ error_exit(attach_id(fd_ctl, root, &attach), free_root); ++ ++ struct ras_nvgpu_driver *nvgpus = calloc(gpu_count, sizeof(struct ras_nvgpu_driver)); ++ if (!nvgpus) { ++ log(ALL, LOG_ERR, "nvgpu alloc error\n"); ++ ret = 1; ++ goto detach; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ char path[32]; ++ struct ras_nvgpu_driver *nvgpu = &nvgpus[i]; ++ NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci = {0}; ++ NV0000_CTRL_GPU_GET_ID_INFO_PARAMS info = {0}; ++ NV0080_ALLOC_PARAMETERS dev = { 0 }; ++ NV2080_ALLOC_PARAMETERS subdev = { 0 }; ++ NvU32 gpu_id = id.gpuIds[i]; ++ int fd; ++ ++ nvgpu->gpu_id = gpu_id; ++ snprintf(path, 32, "/dev/nvidia%d", i); ++ nvgpu->fd = open(path, O_RDWR | O_CLOEXEC); ++ if (nvgpu->fd < 0) { ++ log(ALL, LOG_ERR, "nvgpu open error\n"); ++ goto free_nvgpu; ++ } ++ fd = nvgpu->fd; ++ ++ error_exit(wait_open(fd), free_nvgpu); ++ ++ pci.gpuId = gpu_id; ++ error_exit(get_pci(fd_ctl, root, &pci), free_nvgpu); ++ nvgpu->pci = pci; ++ ++
info.gpuId = id.gpuIds[i]; ++ error_exit(get_id_info(fd_ctl, root, &info), free_nvgpu); ++ ++ error_exit(register_fd(fd, fd_ctl), free_nvgpu); ++ ++ dev.deviceId = info.deviceInstance; ++ error_exit(alloc_device(fd_ctl, root, &dev, &nvgpu->device), free_nvgpu); ++ ++ subdev.subDeviceId = info.subDeviceInstance; ++ error_exit(alloc_subdevice(fd_ctl, root, nvgpu->device, &subdev, &nvgpu->subdevice), free_nvgpu); ++ ++ error_exit(alloc_event(fd, root, nvgpu->device, fd_uvm), free_nvgpu); ++ ++ for (int j = 0; j < NVGPU_EVENT_NUM; j++) { ++ struct ras_nvgpu_event *event = &nvgpu->events[j]; ++ event->index = event_index[j]; ++ ++ event_os_event(fd, root, nvgpu->subdevice, event->index, &event->event, fd_uvm); ++ if (ret) { ++ log(ALL, LOG_ERR, "nvgpu event %d register error\n", event->index); ++ ret = 0; ++ continue; ++ } ++ set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT); ++ if (ret) { ++ log(ALL, LOG_ERR, "nvgpu event %d set error\n", event->index); ++ free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event); ++ ret = 0; ++ continue; ++ } ++ } ++ log(ALL, LOG_INFO, "GPU %d: %04x:%02x:%02x.0 found, deviceid %d subdeviceid %d\n", ++ nvgpu->gpu_id, nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot, info.deviceInstance, info.subDeviceInstance); ++ } ++ ++ pfd = malloc(sizeof(struct pollfd) * gpu_count); ++ if (!pfd) { ++ log(ALL, LOG_ERR, "nvgpu alloc error\n"); ++ ret = 1; ++ goto free_nvgpu; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ pfd[i].fd = nvgpus[i].fd; ++ pfd[i].events = POLLIN | POLLPRI; ++ } ++ ++ while (1) { ++ if (poll(pfd, gpu_count, -1) < 0) { ++ log(ALL, LOG_ERR, "nvgpu poll error\n"); ++ goto free_pfd; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ if (pfd[i].revents & POLLIN) { ++ NvUnixEvent event; ++ ++ get_event(&event, nvgpus[i].fd, fd_uvm, root, nvgpus[i].subdevice, 25); ++ ++ report_ras_nvgpu_driver(&nvgpus[i], &event); ++ } ++ } ++ } ++ ++free_pfd: ++ free(pfd); ++free_nvgpu: ++ for
(i = 0; i < gpu_count; i++) { ++ struct ras_nvgpu_driver *nvgpu = &nvgpus[i]; ++ ++ for (int j = 0; j < NVGPU_EVENT_NUM; j++) { ++ struct ras_nvgpu_event *event = &nvgpu->events[j]; ++ ++ if (event->event) { ++ set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE); ++ free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event); /* parent is this GPU's subdevice, matching the handle the event was allocated under */ ++ } ++ } ++ free_event(nvgpu->fd, root, nvgpu->device); ++ if (nvgpu->subdevice) ++ free_nvgpu(fd_ctl, root, nvgpu->device, nvgpu->subdevice); ++ if (nvgpu->device) ++ free_nvgpu(fd_ctl, root, nvgpu->device, 0); ++ if (nvgpu->device) ++ free_nvgpu(fd_ctl, root, root, nvgpu->device); ++ if (nvgpu->fd) ++ close(nvgpu->fd); ++ } ++detach: ++ deattach_id(fd_ctl, root, &detach); ++free_root: ++ free_nvgpu(fd_ctl, root, root, root); ++close: ++ close(fd_ctl); ++close_uvm: ++ close(fd_uvm); ++ ++ return ret; ++} +\ No newline at end of file +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 2758d14..541ff69 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -14,8 +14,6 @@ + #include "trace-seq.h" + #include "types.h" + +-#define XID_EVENT_NAME "xid" +- + const char *lib_name[] = { + "/lib64/libnvidia-ml.so", + "/lib64/libnvidia-ml.so.1", +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 5c63279..4d39de2 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -43,12 +43,16 @@ void *ras_nvgpu_handle(void *arg) + + while (retry--) { + if (ras_nvgpu_nvml_handle()) { +- log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry); +- sleep(10); ++ log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); ++ sleep(1); + } + } + +- log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n"); ++ log(ALL, LOG_ERR, "NVGPU nvml handle fail, try for nvgpu driver call\n"); ++ ++ ras_nvgpu_driver_handle(); ++ ++ log(ALL, LOG_ERR, "NVGPU driver handle fail, exit nvgpu thread\n"); + + return NULL; + } +diff --git a/ras-nvgpu.h b/ras-nvgpu.h +index 32827ad..bade7e4 100644 +--- a/ras-nvgpu.h ++++ b/ras-nvgpu.h +@@
-8,7 +8,9 @@ + #define __RAS_NVGPU_H + + #define NVGPU_EVENT_NAME "nvgpu" ++#define XID_EVENT_NAME "xid" + + void *ras_nvgpu_handle(void *arg); + int ras_nvgpu_nvml_handle(void); ++int ras_nvgpu_driver_handle(void); + #endif +-- +2.43.5 + diff --git a/1028-anolis-add-trigger-for-nvgpu-event.patch b/1028-anolis-add-trigger-for-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..8c17b64c66c7f3967258bd89889656f775b6a98f --- /dev/null +++ b/1028-anolis-add-trigger-for-nvgpu-event.patch @@ -0,0 +1,241 @@ +From 67fcdb9008b17555b0ea0d4c791f3ac772ee682c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 10:20:16 +0800 +Subject: [PATCH 2/3] anolis: add trigger for nvgpu event + +Signed-off-by: Ruidong Tian +--- + contrib/nvgpu_trigger | 25 +++++++++++++++++++++++++ + misc/rasdaemon.env | 3 +++ + ras-nvgpu-driver.c | 7 ++++++- + ras-nvgpu-nvml.c | 8 +++++++- + ras-nvgpu.c | 3 +++ + trigger.c | 35 +++++++++++++++++++++++++++++++++++ + trigger.h | 1 + + 7 files changed, 80 insertions(+), 2 deletions(-) + create mode 100755 contrib/nvgpu_trigger + +diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger +new file mode 100755 +index 0000000..48955af +--- /dev/null ++++ b/contrib/nvgpu_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# NVIDIA GPU event is reported; the environment variables listed below ++# describe that event. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local ++ ++if [ -d nvgpu_trigger.extern ] ++then ++ ls nvgpu_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_trigger.extern/$item ] && .
./nvgpu_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 198b050..b08afa6 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + KMSG_TRIGGER= + KMSG_TRIGGER_TIMEOUT=0 + ++NVGPU_TRIGGER= ++NVGPU_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c +index a72a7c5..9093292 100644 +--- a/ras-nvgpu-driver.c ++++ b/ras-nvgpu-driver.c +@@ -24,6 +24,7 @@ + + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + #include + #define NV_PLATFORM_MAX_IOCTL_SIZE 16384 + #include "nv.h" +@@ -238,6 +239,7 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -263,7 +265,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + + } + +- trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ snprintf(tmpbuf, sizeof(tmpbuf), "%08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + + trace_seq_terminate(&s); + trace_seq_do_printf(&s); +@@ -271,6 +274,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, event->NotifyIndex, event->info32, event->info16); ++ + return 0; + } + +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 541ff69..f2421a1 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2025 Alibaba Inc + */ + ++#include + #include + #include + #include +@@ -13,6 +14,7 @@ + #include "ras-nvgpu.h" + #include "trace-seq.h" + #include "types.h" ++#include "trigger.h" + + const char *lib_name[] = { + 
"/lib64/libnvidia-ml.so", +@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + trace_seq_printf(&s, "data: %lld ", data->eventData); + } + +- trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); + trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); + +@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0); ++ + return 0; + } + +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 4d39de2..37a8833 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -15,6 +15,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + void *ras_nvgpu_handle(void *arg) + { + (void)arg; +@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg) + return NULL; + } + ++ setup_event_trigger("nvgpu_event"); ++ + while (retry--) { + if (ras_nvgpu_nvml_handle()) { + log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); +diff --git a/trigger.c b/trigger.c +index d410137..e113077 100644 +--- a/trigger.c ++++ b/trigger.c +@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF + + struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"}; + ++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"}; ++ + static struct event_trigger *event_triggers[] = { + &mc_ue_trigger, + #ifdef HAVE_MCE +@@ -122,6 +124,9 @@ static struct 
event_trigger *event_triggers[] = { + #ifdef HAVE_KMSG_MONITOR + &kmsg_trigger, + #endif ++#ifdef HAVE_NVGPU ++ &nvgpu_trigger, ++#endif + }; + + void setup_event_trigger(const char *event) +@@ -476,3 +481,33 @@ free: + free(env[i]); + } + ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ struct event_trigger *trigger = &nvgpu_trigger; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA1=%d", data1) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA2=%d", data2) < 0) ++ goto free; ++ ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (int i = 0; i < ei; i++) ++ free(env[i]); ++} ++ +diff --git a/trigger.h b/trigger.h +index b5a6c2c..2ea2b09 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e); + void run_aer_event_trigger(struct ras_aer_event *e); + void run_page_offline_trigger(unsigned long long addr, int otype, int type); + void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2); + + + #endif +-- +2.43.5 + diff --git a/1029-anolis-add-nvgpu-reset-trigger.patch b/1029-anolis-add-nvgpu-reset-trigger.patch new file mode 100644 index 0000000000000000000000000000000000000000..3cb9e5b02f914e82a05a317a35c5ee4a82b3f171 --- /dev/null +++ b/1029-anolis-add-nvgpu-reset-trigger.patch @@ -0,0 +1,76 @@ +From 866c8169c9376f7c0b8a23966caaf099ebbeee9e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 14:11:30 +0800 +Subject: [PATCH 3/3] anolis: add nvgpu reset trigger + +Signed-off-by:
Ruidong Tian +--- + contrib/nvgpu_reset_trigger | 40 +++++++++++++++++++++++++++++++++++++ + contrib/rasdaemon.init | 4 ++++ + 2 files changed, 44 insertions(+) + create mode 100755 contrib/nvgpu_reset_trigger + +diff --git a/contrib/nvgpu_reset_trigger b/contrib/nvgpu_reset_trigger +new file mode 100755 +index 0000000..769e5e2 +--- /dev/null ++++ b/contrib/nvgpu_reset_trigger +@@ -0,0 +1,40 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# nvgpu event has occurred; it runs "nvidia-smi -r" on the device given in ++# BDF for the EVENT_TYPE/DATA1 combinations matched below. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_reset_trigger.local ] && . ./nvgpu_reset_trigger.local ++ ++if [ -d nvgpu_reset_trigger.extern ] ++then ++ ls nvgpu_reset_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_reset_trigger.extern/$item ] && . ./nvgpu_reset_trigger.extern/$item ++ done ++fi ++ ++if [ "$EVENT_TYPE" = "8" ] && [ "$DATA1" = "48" ] ++then ++ sudo nvidia-smi -r -i $BDF ++fi ++ ++if [ "$EVENT_TYPE" = "2" ] ++then ++ sudo nvidia-smi -r -i $BDF ++fi ++ ++if [ "$EVENT_TYPE" = "37" ] && [ "$DATA1" = "48" ] ++then ++ sudo nvidia-smi -r -i $BDF ++fi ++ ++exit 0 +diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init +index d575af9..5fde6c8 100644 +--- a/contrib/rasdaemon.init ++++ b/contrib/rasdaemon.init +@@ -13,6 +13,10 @@ case "$target" in + sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH} + sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH} + ;; ++ nvgpu_reset) ++ sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH} ++ sed -i 's/^NVGPU_TRIGGER=.*/NVGPU_TRIGGER="nvgpu_reset_trigger"/g' ${ENV_PATH} ++ ;; + jituan) + sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH} + sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH} +--
+2.43.5 + diff --git a/1029-anolis-add-trigger-for-nvgpu-event.patch b/1029-anolis-add-trigger-for-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..7e1c34b13c48eddea6ac10b0e11263c1c6f36ed7 --- /dev/null +++ b/1029-anolis-add-trigger-for-nvgpu-event.patch @@ -0,0 +1,201 @@ +From 03cd59d6aafbd14ed29ce2f9a73d0bbd8f8b23d3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 10:20:16 +0800 +Subject: [PATCH 29/30] anolis: add trigger for nvgpu event + +Signed-off-by: Ruidong Tian +--- + contrib/nvgpu_trigger | 25 +++++++++++++++++++++++++ + misc/rasdaemon.env | 3 +++ + ras-nvgpu-nvml.c | 8 +++++++- + ras-nvgpu.c | 3 +++ + trigger.c | 35 +++++++++++++++++++++++++++++++++++ + trigger.h | 1 + + 6 files changed, 74 insertions(+), 1 deletion(-) + create mode 100755 contrib/nvgpu_trigger + +diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger +new file mode 100755 +index 0000000..48955af +--- /dev/null ++++ b/contrib/nvgpu_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# nvgpu event has occurred; the environment variables listed below carry ++# the values rasdaemon reports for that event. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local ++ ++if [ -d nvgpu_trigger.extern ] ++then ++ ls nvgpu_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_trigger.extern/$item ] && . ./nvgpu_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 198b050..b08afa6 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + KMSG_TRIGGER= + KMSG_TRIGGER_TIMEOUT=0 + ++NVGPU_TRIGGER= ++NVGPU_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second.
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 541ff69..f2421a1 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2025 Alibaba Inc + */ + ++#include + #include + #include + #include +@@ -13,6 +14,7 @@ + #include "ras-nvgpu.h" + #include "trace-seq.h" + #include "types.h" ++#include "trigger.h" + + const char *lib_name[] = { + "/lib64/libnvidia-ml.so", +@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + trace_seq_printf(&s, "data: %lld ", data->eventData); + } + +- trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); + trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); + +@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0); ++ + return 0; + } + +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 4d39de2..37a8833 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -15,6 +15,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + void *ras_nvgpu_handle(void *arg) + { + (void)arg; +@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg) + return NULL; + } + ++ setup_event_trigger("nvgpu_event"); ++ + while (retry--) { + if (ras_nvgpu_nvml_handle()) { + log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); +diff --git a/trigger.c b/trigger.c +index d410137..e113077 100644 +--- a/trigger.c ++++ b/trigger.c 
+@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF + + struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"}; + ++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"}; ++ + static struct event_trigger *event_triggers[] = { + &mc_ue_trigger, + #ifdef HAVE_MCE +@@ -122,6 +124,9 @@ static struct event_trigger *event_triggers[] = { + #ifdef HAVE_KMSG_MONITOR + &kmsg_trigger, + #endif ++#ifdef HAVE_NVGPU ++ &nvgpu_trigger, ++#endif + }; + + void setup_event_trigger(const char *event) +@@ -476,3 +481,33 @@ free: + free(env[i]); + } + ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ struct event_trigger *trigger = &nvgpu_trigger; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA1=%d", data1) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA2=%d", data2) < 0) ++ goto free; ++ ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (int i = 0; i < ei; i++) ++ free(env[i]); ++} ++ +diff --git a/trigger.h b/trigger.h +index b5a6c2c..2ea2b09 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e); + void run_aer_event_trigger(struct ras_aer_event *e); + void run_page_offline_trigger(unsigned long long addr, int otype, int type); + void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2); + + + #endif +-- +2.43.5 + diff --git a/1030-fix-build-error-of-some-variable-undefine.patch
b/1030-fix-build-error-of-some-variable-undefine.patch new file mode 100644 index 0000000000000000000000000000000000000000..c13f203c1524f81e34a219a89a81d70373e2c4c6 --- /dev/null +++ b/1030-fix-build-error-of-some-variable-undefine.patch @@ -0,0 +1,27 @@ +From e2c1a3ce09f74e6de2ea8bb710b51babf7645376 Mon Sep 17 00:00:00 2001 +From: happy_orange +Date: Fri, 6 Jun 2025 14:42:13 +0800 +Subject: [PATCH 1/1] fix build error of some variable undefine + +--- + ras-pcie-edpc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/ras-pcie-edpc.c b/ras-pcie-edpc.c +index 4731b05..53d93ed 100644 +--- a/ras-pcie-edpc.c ++++ b/ras-pcie-edpc.c +@@ -41,8 +41,8 @@ static bool is_cxl_mem_or_cache(struct pci_dev *dev) + if (vendor != PCI_DVSEC_VENDOR_ID_CXL || id != PCI_DVSEC_ID_CXL) + return false; + +- cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_CAP); +- if (cxl_cap & (PCI_CXL_CAP_CACHE | PCI_CXL_CAP_MEM)) ++ cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_DEV_CAP); ++ if (cxl_cap & (PCI_CXL_DEV_CAP_CACHE | PCI_CXL_DEV_CAP_MEM)) + return true; + + return false; +-- +2.43.5 + diff --git a/dist b/dist new file mode 100644 index 0000000000000000000000000000000000000000..ffd87663ad49340de9f1a1f342206406d2ba1712 --- /dev/null +++ b/dist @@ -0,0 +1 @@ +an23 diff --git a/rasdaemon-0.8.0.tar.bz2 b/rasdaemon-0.8.0.tar.bz2 deleted file mode 100644 index 8837c6ab83ad422135d477bf4e3d36d88a8eb1ce..0000000000000000000000000000000000000000 Binary files a/rasdaemon-0.8.0.tar.bz2 and /dev/null differ diff --git a/rasdaemon-0.8.3.tar.bz2 b/rasdaemon-0.8.3.tar.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..bc6bb41df46c10f3f3b80d222fd311a445e1884c Binary files /dev/null and b/rasdaemon-0.8.3.tar.bz2 differ diff --git a/rasdaemon.spec b/rasdaemon.spec index 0a0057f8f1529cb4507fd777673eec54de1495e7..a574639cfbee32d6f579329e2d9119fb98d049f2 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,22 +1,71 @@ -%define anolis_release 1 - +%define 
anolis_release 2 Name: rasdaemon -Version: 0.8.0 +Version: 0.8.3 Release: %{anolis_release}%{?dist} Summary: Utility to receive RAS error tracings -Group: Applications/System -License: GPLv2 +License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 +Patch1001: 1001-config-add-syslog-ng-and-logrotate-config.patch +Patch1002: 1002-config-add-rsyslog-config.patch +Patch1003: 1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch +Patch1004: 1004-rasdaemon-align-event-name-in-log.patch +Patch1005: 1005-rasdaemon-skip-doesn-t-exist-event.patch +Patch1006: 1006-rasdaemon-support-memory-corrected-error-statistics.patch +Patch1007: 1007-rasdaemon-introduce-poison-page-statistics.patch +Patch1008: 1008-rasdaemon-erst-decode-panic-mce-through-erst.patch +Patch1009: 1009-aer-print-pci-device-name-and-vendor-device-id.patch +Patch1010: 1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch +Patch1011: 1011-rasdaemon-support-nvgpu-event.patch +Patch1012: 1012-rasdaemon-enhance-rasdaemon-event-trigger.patch +Patch1013: 1013-rasdaemon-add-event-level-for-event-record.patch +Patch1014: 1014-anolis-syslog-add-rasdaemon.ext.patch +Patch1015: 1015-rasdaemon-add-page-offline-trigger.patch +Patch1016: 1016-anolis-compta-rasdaemon-notices.patch +Patch1017: 1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch +Patch1018: 1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch +Patch1019: 1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch +Patch1020: 1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch +Patch1021: 1021-anolis-config-disable-page-offline-defalut.patch +Patch1022: 1022-anolis-disable-block-and-dev-error-default.patch +Patch1023: 1023-anolis-add-nvml-in-tree.patch +Patch1024: 1024-anolis-do-not-print-teq-error.patch +Patch1025: 1025-anolis-add-init.sh-for-different-user.patch +Patch1026: 1026-anolis-fix-systemd-config.patch +Patch1027: 
1027-anolis-add-nvgpu-driver.patch +Patch1028: 1028-anolis-add-trigger-for-nvgpu-event.patch +Patch1029: 1029-anolis-add-nvgpu-reset-trigger.patch +Patch1030: 1030-fix-build-error-of-some-variable-undefine.patch -BuildRequires: make gcc autoconf automake libtool perl-generators -BuildRequires: gettext-devel sqlite-devel libtraceevent-devel systemd-rpm-macros +ExcludeArch: s390 s390x +BuildRequires: make +BuildRequires: gcc +BuildRequires: gettext-devel +BuildRequires: perl-generators +BuildRequires: sqlite-devel +BuildRequires: systemd +BuildRequires: autoconf +BuildRequires: automake +BuildRequires: libtool +BuildRequires: libtraceevent-devel +BuildRequires: pciutils-devel +BuildRequires: zlib-devel +BuildRequires: python3 +BuildRequires: rasdaemon-open-gpu-kernel-modules Provides: bundled(kernel-event-lib) -Requires: hwdata perl-DBD-SQLite libtraceevent -%ifarch x86_64 +Requires: hwdata +Requires: perl-DBD-SQLite +Requires: libtraceevent +Requires: pciutils +Requires: zlib +%ifarch %{ix86} x86_64 Requires: dmidecode %endif +Requires(post): systemd +Requires(preun): systemd +Requires(postun): systemd + %description %{name} is a RAS (Reliability, Availability and Serviceability) logging tool. It currently records memory errors, using the EDAC tracing events. 
@@ -37,45 +86,147 @@ Doc files for %{name} %prep %setup -q -autoreconf -vfi +tar -xf /usr/share/rasdaemon-open-gpu-kernel-modules/*.tar.gz +mv open-gpu-kernel-modules-* open-gpu-kernel-modules +%patch1001 -p1 +%patch1002 -p1 +%patch1003 -p1 +%patch1004 -p1 +%patch1005 -p1 +%patch1006 -p1 +%patch1007 -p1 +%patch1008 -p1 +%patch1009 -p1 +%patch1010 -p1 +%patch1011 -p1 +%patch1012 -p1 +%patch1013 -p1 +%patch1014 -p1 +%patch1015 -p1 +%patch1016 -p1 +%patch1017 -p1 +%patch1018 -p1 +%patch1019 -p1 +%patch1020 -p1 +%patch1021 -p1 +%patch1022 -p1 +%patch1023 -p1 +%patch1024 -p1 +%patch1025 -p1 +%patch1026 -p1 +%patch1027 -p1 +%patch1028 -p1 +%patch1029 -p1 +%patch1030 -p1 + %build -%ifarch aarch64 -%configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm \ - --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ - --enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \ - --enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \ - --with-sysconfdefdir=%{_sysconfdir}/sysconfig +%ifarch loongarch64 +%configure --enable-sqlite3 --enable-aer --enable-non-standard \ + --enable-devlink --enable-diskerror \ + --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \ + --enable-cxl --enable-json-report --enable-memory-ce-pfa --enable-memory-row-ce-pfa \ + --enable-signal --enable-erst --enable-kmsg-monitor \ + --with-sysconfdefdir=%{_sysconfdir}/sysconfig %else -%configure --enable-sqlite3 --enable-aer \ - --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ - --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \ - --with-sysconfdefdir=%{_sysconfdir}/sysconfig +%configure --enable-all --with-sysconfdefdir=%{_sysconfdir}/sysconfig %endif -%make_build +make %{?_smp_mflags} %install make install DESTDIR=%{buildroot} -install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service +install -D -p -m 0644 misc/%{name}.service 
%{buildroot}%{_unitdir}/%{name}.service install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service -install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} -rm %{buildroot}/usr/include/*.h -%generate_compatibility_deps +install -D -p -m 0644 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} +install -D -p -m 0644 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng +install -D -p -m 0644 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate +install -D -p -m 0644 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog +install -D -p -m 0644 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext +install -D -p -m 0644 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext +install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ +install -D -p -m 0755 contrib/%{name}.init %{buildroot}/usr/share/%{name}/%{name}.init +rm INSTALL %{buildroot}/usr/include/*.h %files -%license COPYING -%{_sbindir}/rasdaemon +%doc AUTHORS ChangeLog COPYING TODO +%{_sbindir}/%{name} %{_sbindir}/ras-mc-ctl %{_mandir}/*/* %{_unitdir}/*.service %{_sysconfdir}/ras/dimm_labels.d -%config(noreplace) %{_sysconfdir}/sysconfig/%{name} -%dir %{abidir} -%{abidir}/rasdaemon-option.list +%{_sysconfdir}/ras/*/* +%config %{_sysconfdir}/sysconfig/%{name} +/usr/share/%{name}/%{name}.syslog-ng +/usr/share/%{name}/%{name}.logrotate +/usr/share/%{name}/%{name}.rsyslog +/usr/share/%{name}/%{name}.syslog-ng-ext +/usr/share/%{name}/%{name}.rsyslog-ext +/usr/share/%{name}/%{name}.init +%{_sysconfdir}/rasdaemon_notices/* + +%post +if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng
%{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng-ext %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; +fi +if systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + ln -s /usr/share/%{name}/%{name}.rsyslog-ext %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; +fi +if [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; + ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name}; +fi +if ! systemctl is-enabled --quiet %{name}.service; then + echo "Rasdaemon service is not enabled, enable it"; + systemctl enable %{name}.service; +fi +echo "Rasdaemon install for ${RASDAEMON_TARGET}"; +/usr/share/%{name}/%{name}.init ${RASDAEMON_TARGET} + +systemctl daemon-reload +systemctl restart %{name}.service || : + +%preun +[ $1 -ne 0 ] || systemctl stop %{name}.service || : +[ $1 -ne 0 ] || systemctl disable %{name}.service || : + +%postun +if [ $1 -eq 0 ] && systemctl is-active --quiet syslog-ng.service; then + echo "Syslog-ng service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; +fi +if [ $1 -eq 0 ] && systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; +fi +if [ $1 -eq 0 ] && [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; +fi %files doc -%doc AUTHORS ChangeLog README.md
TODO INSTALL +%doc AUTHORS ChangeLog README.md TODO %changelog +* Thu Mar 20 2025 wangzhe - 0.8.3-2 +- update to 0.8.3 +- support mc event stat +- support poison stat +- support log level +- support nvgpu event + * Fri Apr 07 2023 Chunmei Xu - 0.8.0-1 - init from upstream