diff --git a/1001-config-add-syslog-ng-and-logrotate-config.patch b/1001-config-add-syslog-ng-and-logrotate-config.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b540055656d82164142d04775163ec55859d4f8c
--- /dev/null
+++ b/1001-config-add-syslog-ng-and-logrotate-config.patch
@@ -0,0 +1,203 @@
+From 6949adcd0e7595000b882d57ebc7e3f47c40508e Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 18 Mar 2025 16:24:56 +0800
+Subject: [PATCH 01/30] config: add syslog-ng and logrotate config
+
+Redirect all rasdaemon logs to /var/log/rasdaemon and configure logrotate;
+add the related modifications to rasdaemon.spec.
+
+The patch does not directly add a dependency on the syslog-ng package
+to the rasdaemon RPM. Instead, it dynamically checks whether the
+syslog-ng service is running during installation and configures accordingly.
+
+Signed-off-by: Bing Wu <jiyu.wb@alibaba-inc.com>
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                 | 31 +++++++++++++++++++++-----
+ man/rasdaemon.1.in          |  3 ++-
+ misc/rasdaemon.logrotate.in | 14 ++++++++++++
+ misc/rasdaemon.spec.in      | 43 +++++++++++++++++++++++++++++++++----
+ misc/rasdaemon.syslog-ng.in |  7 ++++++
+ 6 files changed, 90 insertions(+), 10 deletions(-)
+ create mode 100644 misc/rasdaemon.logrotate.in
+ create mode 100644 misc/rasdaemon.syslog-ng.in
+
+diff --git a/Makefile.am b/Makefile.am
+index 01132fe..a1f6edf 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -5,27 +5,42 @@ ACLOCAL_AMFLAGS=-I m4
+ SUBDIRS = util man
+ SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in
+ SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service)
++SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in
++SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng)
++LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in
++LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate)
+ EXTRA_DIST = \
+-	$(SYSTEMD_SERVICES_IN) misc/rasdaemon.env \
++	$(SYSTEMD_SERVICES_IN) \
++	$(SYSLOG_SERVICES_IN) \
++	$(LOGROTATE_SERVICES_IN) \
++	misc/rasdaemon.env \
+ 	contrib/mc_event_trigger \
+ 	contrib/mem_fail_trigger
+ 
+ CLEANFILES= \
+ 	misc/ras-mc-ctl.service	\
+-	misc/rasdaemon.service
++	misc/rasdaemon.service \
++	misc/rasdaemon.syslog-ng \
++	misc/rasdaemon.logrotate
+ 
+ DISTCLEANFILES = misc/rasdaemon.spec
+ 
+ # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin
+ # during ./configure phase, therefore it is not possible to add .service.in
+ # files to AC_CONFIG_FILES in configure.ac
+-SUFFIXES = .service.in .service
++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng
+ .service.in.service:
+ 	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@
+ 
++.logrotate.in.logrotate:
++	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
++
++.syslog-ng.in.syslog-ng:
++	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
++
+ # This rule is needed because the service files must be generated on target
+ # system after ./configure phase
+-all-local: $(SYSTEMD_SERVICES)
++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES)
+ 
+ sbin_PROGRAMS = rasdaemon
+ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
+@@ -128,6 +143,12 @@ upload:
+ install-data-local:
+ 	$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
+ 	$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
+-	$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
++	install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
+ 	$(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
+ 	$(install_sh) @abs_srcdir@/contrib/mem_fail_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger"
++	if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \
++		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \
++	fi
++	if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \
++		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \
++	fi
+diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in
+index 7cfef54..e884e55 100644
+--- a/man/rasdaemon.1.in
++++ b/man/rasdaemon.1.in
+@@ -34,7 +34,8 @@ rasdaemon \- RAS daemon to log the RAS events.
+ The \fBrasdaemon\fR program is a daemon which monitors the platform
+ Reliablity, Availability and Serviceability (RAS) reports from the
+ Linux kernel trace events. These trace events are logged in
+-/sys/kernel/debug/tracing, reporting them via syslog/journald.
++/sys/kernel/debug/tracing, reporting them via syslog/journald. If
++syslog-ng is installed, the events will be logged to @localstatedir@/log/rasdaemon.
+ 
+ .SH OPTIONS
+ .TP
+diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in
+new file mode 100644
+index 0000000..b7b62fe
+--- /dev/null
++++ b/misc/rasdaemon.logrotate.in
+@@ -0,0 +1,14 @@
++@localstatedir@/log/rasdaemon {
++	compress
++	monthly
++	size 100M
++	dateext
++	rotate 4
++	notifempty
++	missingok
++	copytruncate
++	sharedscripts
++	postrotate
++		@sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true
++	endscript
++}
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index 32c69b7..8ab3d50 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -49,20 +49,55 @@ make %{?_smp_mflags}
+ 
+ %install
+ make install DESTDIR=%{buildroot}
+-install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
++install -D -p -m 0644 misc/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
+ install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
+-install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
++install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
++install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng
++install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate
+ rm INSTALL %{buildroot}/usr/include/*.h
+ 
+ %files
+-%doc AUTHORS ChangeLog COPYING README.md TODO
+-%{_sbindir}/rasdaemon
++%doc AUTHORS ChangeLog COPYING TODO
++%{_sbindir}/%{name}
+ %{_sbindir}/ras-mc-ctl
+ %{_mandir}/*/*
+ %{_unitdir}/*.service
+ %{_sysconfdir}/ras/dimm_labels.d
+ %{_sysconfdir}/ras/*/*
+ %config(noreplace) %{_sysconfdir}/sysconfig/%{name}
++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng
++%config(noreplace) /usr/share/%{name}/%{name}.logrotate
++
++%post
++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then
++    echo "Syslog service is enabled and running, create config file and restart it";
++    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
++    ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
++    systemctl restart syslog-ng.service;
++fi
++if [ -d "%{_sysconfdir}/logrotate.d" ]; then
++    rm -rf %{_sysconfdir}/logrotate.d/%{name};
++    ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name};
++fi
++if ! systemctl is-enabled --quiet %{name}.service; then
++    echo "Rasdaemon service is not enabled, enable it";
++    systemctl enable %{name}.service;
++fi
++systemctl restart %{name}.service
++
++%preun
++systemctl stop %{name}.service
++systemctl disable %{name}.service
++
++%postun
++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then
++    echo "Syslog service is enabled and running, delete config file and restart it";
++    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
++    systemctl restart syslog-ng.service;
++fi
++if [ -d "%{_sysconfdir}/logrotate.d" ]; then
++    rm -rf %{_sysconfdir}/logrotate.d/%{name};
++fi
+ 
+ %changelog
+ 
+diff --git a/misc/rasdaemon.syslog-ng.in b/misc/rasdaemon.syslog-ng.in
+new file mode 100644
+index 0000000..b3308f8
+--- /dev/null
++++ b/misc/rasdaemon.syslog-ng.in
+@@ -0,0 +1,7 @@
++# SPDX-License-Identifier: GPL-2.0
++
++destination d_rasdaemon		{ file("@localstatedir@/log/rasdaemon" persist-name(rasdaemon-syslog)); };
++
++filter f_rasdaemon		{ program("rasdaemon"); };
++
++log { source(s_sys); filter(f_rasdaemon); destination(d_rasdaemon); };
+-- 
+2.43.5
+
diff --git a/1002-config-add-rsyslog-config.patch b/1002-config-add-rsyslog-config.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8ad5777c21a1c2b7436c8e7ff6357027c9d7fddf
--- /dev/null
+++ b/1002-config-add-rsyslog-config.patch
@@ -0,0 +1,160 @@
+From f1f27e8f90a0be341e367a40962f6f7103504659 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 15 Apr 2025 11:18:02 +0800
+Subject: [PATCH 02/30] config: add rsyslog config
+
+Redirect all rasdaemon logs to /var/log/rasdaemon and
+add the related modifications to rasdaemon.spec.
+
+The patch does not directly add a dependency on the rsyslog package
+to the rasdaemon RPM. Instead, it dynamically checks whether the
+rsyslog service is running during installation and configures accordingly.
+
+Signed-off-by: Bing Wu <jiyu.wb@alibaba-inc.com>
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                 | 14 ++++++++++++--
+ misc/rasdaemon.logrotate.in |  3 ++-
+ misc/rasdaemon.rsyslog.in   |  3 +++
+ misc/rasdaemon.spec.in      | 19 ++++++++++++++++---
+ 5 files changed, 34 insertions(+), 6 deletions(-)
+ create mode 100644 misc/rasdaemon.rsyslog.in
+
+diff --git a/Makefile.am b/Makefile.am
+index a1f6edf..e3e66bb 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -9,9 +9,12 @@ SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in
+ SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng)
+ LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in
+ LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate)
++RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in
++RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog)
+ EXTRA_DIST = \
+ 	$(SYSTEMD_SERVICES_IN) \
+ 	$(SYSLOG_SERVICES_IN) \
++	$(RSYSLOG_SERVICES_IN) \
+ 	$(LOGROTATE_SERVICES_IN) \
+ 	misc/rasdaemon.env \
+ 	contrib/mc_event_trigger \
+@@ -21,6 +24,7 @@ CLEANFILES= \
+ 	misc/ras-mc-ctl.service	\
+ 	misc/rasdaemon.service \
+ 	misc/rasdaemon.syslog-ng \
++	misc/rasdaemon.rsyslog \
+ 	misc/rasdaemon.logrotate
+ 
+ DISTCLEANFILES = misc/rasdaemon.spec
+@@ -28,7 +32,7 @@ DISTCLEANFILES = misc/rasdaemon.spec
+ # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin
+ # during ./configure phase, therefore it is not possible to add .service.in
+ # files to AC_CONFIG_FILES in configure.ac
+-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng
++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog
+ .service.in.service:
+ 	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@
+ 
+@@ -38,9 +42,12 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n
+ .syslog-ng.in.syslog-ng:
+ 	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
+ 
++.rsyslog.in.rsyslog:
++	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
++
+ # This rule is needed because the service files must be generated on target
+ # system after ./configure phase
+-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES)
++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES)
+ 
+ sbin_PROGRAMS = rasdaemon
+ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
+@@ -149,6 +156,9 @@ install-data-local:
+ 	if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \
+ 	fi
++	if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \
++		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.conf"; \
++	fi
+ 	if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \
+ 	fi
+diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in
+index b7b62fe..ca188ba 100644
+--- a/misc/rasdaemon.logrotate.in
++++ b/misc/rasdaemon.logrotate.in
+@@ -9,6 +9,7 @@
+ 	copytruncate
+ 	sharedscripts
+ 	postrotate
+-		@sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true
++		(@sbindir@/systemctl is-active --quiet syslog-ng.service && @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1) || true
++		(@sbindir@/systemctl is-active --quiet rsyslog.service &&@sbindir@/systemctl kill -s HUP rsyslog.service >/dev/null 2>&1) || true
+ 	endscript
+ }
+diff --git a/misc/rasdaemon.rsyslog.in b/misc/rasdaemon.rsyslog.in
+new file mode 100644
+index 0000000..d1a5cf1
+--- /dev/null
++++ b/misc/rasdaemon.rsyslog.in
+@@ -0,0 +1,3 @@
++# SPDX-License-Identifier: GPL-2.0
++
++:programname, isequal, "rasdaemon" @localstatedir@/log/rasdaemon
+\ No newline at end of file
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index 8ab3d50..4cc859f 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -54,6 +54,7 @@ install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl
+ install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
+ install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng
+ install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate
++install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog
+ rm INSTALL %{buildroot}/usr/include/*.h
+ 
+ %files
+@@ -67,14 +68,21 @@ rm INSTALL %{buildroot}/usr/include/*.h
+ %config(noreplace) %{_sysconfdir}/sysconfig/%{name}
+ %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng
+ %config(noreplace) /usr/share/%{name}/%{name}.logrotate
++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog
+ 
+ %post
+-if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then
++if systemctl is-active --quiet syslog-ng.service; then
+     echo "Syslog service is enabled and running, create config file and restart it";
+     rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+     ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+     systemctl restart syslog-ng.service;
+ fi
++if systemctl is-active --quiet rsyslog.service; then
++    echo "Rsyslog service is enabled and running, create config file and restart it";
++    rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf;
++    ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf;
++    systemctl restart rsyslog.service;
++fi
+ if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+     rm -rf %{_sysconfdir}/logrotate.d/%{name};
+     ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name};
+@@ -90,11 +98,16 @@ systemctl stop %{name}.service
+ systemctl disable %{name}.service
+ 
+ %postun
+-if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then
+-    echo "Syslog service is enabled and running, delete config file and restart it";
++if systemctl is-active --quiet syslog-ng.service; then
++    echo "Syslog-ng service is enabled and running, delete config file and restart it";
+     rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+     systemctl restart syslog-ng.service;
+ fi
++if systemctl is-active --quiet rsyslog.service; then
++    echo "Rsyslog service is enabled and running, delete config file and restart it";
++    rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf;
++    systemctl restart rsyslog.service;
++fi
+ if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+     rm -rf %{_sysconfdir}/logrotate.d/%{name};
+ fi
+-- 
+2.43.5
+
diff --git a/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1474b7d2c9bd9c96be286d64cb20fac267baa4bf
--- /dev/null
+++ b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch
@@ -0,0 +1,734 @@
+From e14173ad86ac94b9e4af84eaddb1abe3bc6410b7 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 18 Mar 2025 15:25:09 +0800
+Subject: [PATCH] rasdaemon: trace SIGBUS event for hardware error
+
+The kernel sends SIGBUS to a program when it reads a DE/UE. Use rasdaemon to
+catch this SIGBUS and print it as follows:
+	<...>-71085 [056] d...     0.007781 signal_generate \
+	2025-03-18 15:24:11 +0800 signal: Bus error, errorno: 0, code: 4, \
+	comm: einj_mem_uc, pid: 71085, grp: 0, res: Delivered, \
+	msg: Hardware memory error consumed: action required
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am          |   5 +-
+ configure.ac         |  11 ++++
+ ras-events.c         |  27 +++++++-
+ ras-events.h         |   1 +
+ ras-record.c         |  75 +++++++++++++++++++++++
+ ras-record.h         |  20 ++++++
+ ras-report.c         |  82 +++++++++++++++++++++++++
+ ras-report.h         |   6 +-
+ ras-signal-handler.c | 143 +++++++++++++++++++++++++++++++++++++++++++
+ ras-signal-handler.h |  30 +++++++++
+ util/ras-mc-ctl.in   |  42 ++++++++++++-
+ 11 files changed, 438 insertions(+), 4 deletions(-)
+ create mode 100644 ras-signal-handler.c
+ create mode 100644 ras-signal-handler.h
+
+diff --git a/Makefile.am b/Makefile.am
+index e3e66bb..1306d97 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -112,6 +112,9 @@ endif
+ if WITH_JAGUAR_NS_DECODE
+    rasdaemon_SOURCES += non-standard-jaguarmicro.c
+ endif
++if WITH_SIGNAL
++   rasdaemon_SOURCES += ras-signal-handler.c
++endif
+ 
+ rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS)
+ rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS)
+@@ -122,7 +125,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+-		  non-standard-jaguarmicro.h trigger.h unified-sel.h
++		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/configure.ac b/configure.ac
+index 1cb00b6..25e0cb2 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -244,6 +244,16 @@ AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes
+ AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"])
+ 
++AC_ARG_ENABLE([signal],
++    AS_HELP_STRING([--enable-signal], [enable signal event(currently experimental)]))
++
++AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_DEFINE(HAVE_SIGNAL,1,"have signal event")
++  AC_SUBST([WITH_SIGNAL])
++])
++AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes])
++AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"])
++
+ test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
+ 
+ CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
+@@ -290,4 +300,5 @@ compile time options summary
+     CPU fault isolation : $USE_CPU_FAULT_ISOLATION
+     YITIAN RAS errors   : $USE_YITIAN_NS_DECODE
+     JAGUAR RAS errors   : $USE_JAGUAR_NS_DECODE
++    Signal              : $USE_SIGNAL
+ EOF
+diff --git a/ras-events.c b/ras-events.c
+index 6692a31..2220e9a 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -34,6 +34,7 @@
+ #include "ras-memory-failure-handler.h"
+ #include "ras-non-standard-handler.h"
+ #include "ras-page-isolation.h"
++#include "ras-signal-handler.h"
+ #include "ras-record.h"
+ #include "trigger.h"
+ 
+@@ -315,6 +316,10 @@ int toggle_ras_mc_event(int enable)
+ 	rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable);
+ #endif
+ 
++#ifdef HAVE_SIGNAL
++	rc |= __toggle_ras_mc_event(ras, "signal", "signal_generate", enable);
++#endif
++
+ free_ras:
+ 	free(ras);
+ 	if (rc)
+@@ -335,7 +340,7 @@ static void setup_event_trigger(char *event)
+ }
+ 
+ #ifdef HAVE_DISKERROR
+-#ifndef HAVE_BLK_RQ_ERROR
++#if (!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL)
+ /*
+  * Set kernel filter. libtrace doesn't provide an API for setting filters
+  * in kernel, we have to implement it here.
+@@ -943,6 +948,10 @@ int handle_ras_events(int record_events, int enable_ipmitool)
+ #ifdef HAVE_DEVLINK
+ 	char *filter_str = NULL;
+ #endif
++#ifdef HAVE_SIGNAL
++	char signal_filter[64];
++#endif
++
+ 
+ 	ras = calloc(1, sizeof(*ras));
+ 	if (!ras) {
+@@ -1173,6 +1182,22 @@ int handle_ras_events(int record_events, int enable_ipmitool)
+ 		    "cxl", "memory_module");
+ #endif
+ 
++#ifdef HAVE_SIGNAL
++	snprintf(signal_filter, sizeof(signal_filter), "sig == %d && code >= %d", SIGBUS, BUS_OBJERR);
++	// ensure filter enabled
++	usleep(30000);
++	rc = filter_ras_mc_event(ras, "signal", "signal_generate", signal_filter);
++	if (!rc) {
++		rc = add_event_handler(ras, pevent, page_size, "signal", "signal_generate",
++				       ras_signal_event_handler, NULL, SIGNAL_EVENT);
++		if (!rc)
++			num_events++;
++		else if (rc != -EINVAL)
++			log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
++			    "signal", "signal_generate");
++	}
++#endif
++
+ 	if (!num_events) {
+ 		log(ALL, LOG_INFO,
+ 		    "Failed to trace any supported RAS events. Aborting.\n");
+diff --git a/ras-events.h b/ras-events.h
+index 83d41df..1689a12 100644
+--- a/ras-events.h
++++ b/ras-events.h
+@@ -35,6 +35,7 @@ enum {
+ 	CXL_GENERAL_MEDIA_EVENT,
+ 	CXL_DRAM_EVENT,
+ 	CXL_MEMORY_MODULE_EVENT,
++	SIGNAL_EVENT,
+ 	NR_EVENTS
+ };
+ 
+diff --git a/ras-record.c b/ras-record.c
+index eed7aca..31a93a4 100644
+--- a/ras-record.c
++++ b/ras-record.c
+@@ -1142,6 +1142,61 @@ int ras_store_cxl_memory_module_event(struct ras_events *ras,
+ }
+ #endif
+ 
++#ifdef HAVE_SIGNAL
++static const struct db_fields signal_event_fields[] = {
++	{ .name = "id",			.type = "INTEGER PRIMARY KEY" },
++	{ .name = "timestamp",	.type = "TEXT" },
++	{ .name = "sig",		.type = "INTEGER" },
++	{ .name = "errorno",	.type = "INTEGER" },
++	{ .name = "code",		.type = "INTEGER" },
++	{ .name = "comm",		.type = "TEXT" },
++	{ .name = "pid",		.type = "INTEGER" },
++	{ .name = "grp",		.type = "INTEGER" },
++	{ .name = "res",		.type = "INTEGER" },
++
++};
++
++static const struct db_table_descriptor signal_event_tab = {
++	.name = "signal_event",
++	.fields = signal_event_fields,
++	.num_fields = ARRAY_SIZE(signal_event_fields),
++};
++
++int ras_store_signal_event(struct ras_events *ras, struct ras_signal_event *ev)
++{
++	int rc;
++	struct sqlite3_priv *priv = ras->db_priv;
++
++	if (!priv || !priv->stmt_signal_event)
++		return -1;
++	log(TERM, LOG_INFO, "signal_event store: %p\n", priv->stmt_signal_event);
++
++	sqlite3_bind_text(priv->stmt_signal_event,  1, ev->timestamp, -1, NULL);
++	sqlite3_bind_int(priv->stmt_signal_event,  2, ev->sig);
++	sqlite3_bind_int(priv->stmt_signal_event,  3, ev->error_no);
++	sqlite3_bind_int(priv->stmt_signal_event,  4, ev->code);
++	sqlite3_bind_text(priv->stmt_signal_event, 5, ev->comm, -1, NULL);
++	sqlite3_bind_int(priv->stmt_signal_event,  6, ev->pid);
++	sqlite3_bind_int(priv->stmt_signal_event,  7, ev->group);
++	sqlite3_bind_int(priv->stmt_signal_event,  8, ev->result);
++
++	rc = sqlite3_step(priv->stmt_signal_event);
++	if (rc != SQLITE_OK && rc != SQLITE_DONE)
++		log(TERM, LOG_ERR,
++		    "Failed to do signal_event step on sqlite: error = %d\n", rc);
++
++	rc = sqlite3_reset(priv->stmt_signal_event);
++	if (rc != SQLITE_OK && rc != SQLITE_DONE)
++		log(TERM, LOG_ERR,
++		    "Failed reset signal_event on sqlite: error = %d\n",
++		    rc);
++
++	log(TERM, LOG_INFO, "register inserted at db\n");
++
++	return rc;
++}
++#endif
++
+ /*
+  * Generic code
+  */
+@@ -1550,6 +1605,16 @@ int ras_mc_event_opendb(unsigned int cpu, struct ras_events *ras)
+ 	}
+ #endif
+ 
++#ifdef HAVE_SIGNAL
++	rc = ras_mc_create_table(priv, &signal_event_tab);
++	if (rc == SQLITE_OK) {
++		rc = ras_mc_prepare_stmt(priv, &priv->stmt_signal_event,
++					 &signal_event_tab);
++		if (rc != SQLITE_OK)
++			goto error;
++	}
++#endif
++
+ 	ras->db_priv = priv;
+ 	return 0;
+ 
+@@ -1734,6 +1799,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
+ 	}
+ #endif
+ 
++#ifdef HAVE_SIGNAL
++	if (priv->stmt_signal_event) {
++		rc = sqlite3_finalize(priv->stmt_signal_event);
++		if (rc != SQLITE_OK)
++			log(TERM, LOG_ERR,
++			    "cpu %u: Failed to finalize signal_event sqlite: error = %d\n",
++			    cpu, rc);
++	}
++#endif
++
+ 	rc = sqlite3_close_v2(db);
+ 	if (rc != SQLITE_OK)
+ 		log(TERM, LOG_ERR,
+diff --git a/ras-record.h b/ras-record.h
+index eec0702..2dd6630 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -9,6 +9,7 @@
+ #define __RAS_RECORD_H
+ 
+ #include <sqlite3.h>
++#include <fcntl.h>
+ #include <stdbool.h>
+ #include <stdint.h>
+ 
+@@ -258,6 +259,17 @@ struct ras_cxl_memory_module_event {
+ 	uint8_t res_id[CXL_PLDM_RES_ID_LEN];
+ };
+ 
++struct ras_signal_event {
++	char timestamp[64];
++	int sig;
++	int error_no;
++	int code;
++	char *comm;
++	pid_t pid;
++	int group;
++	int result;
++};
++
+ struct ras_mc_event;
+ struct ras_aer_event;
+ struct ras_extlog_event;
+@@ -275,6 +287,7 @@ struct ras_cxl_generic_event;
+ struct ras_cxl_general_media_event;
+ struct ras_cxl_dram_event;
+ struct ras_cxl_memory_module_event;
++struct ras_signal_event;
+ 
+ #ifdef HAVE_SQLITE3
+ 
+@@ -315,6 +328,9 @@ struct sqlite3_priv {
+ 	sqlite3_stmt	*stmt_cxl_dram_event;
+ 	sqlite3_stmt	*stmt_cxl_memory_module_event;
+ #endif
++#ifdef HAVE_SIGNAL
++	sqlite3_stmt	*stmt_signal_event;
++#endif
+ };
+ 
+ struct db_fields {
+@@ -361,6 +377,8 @@ int ras_store_cxl_dram_event(struct ras_events *ras,
+ 			     struct ras_cxl_dram_event *ev);
+ int ras_store_cxl_memory_module_event(struct ras_events *ras,
+ 				      struct ras_cxl_memory_module_event *ev);
++int ras_store_signal_event(struct ras_events *ras,
++			   struct ras_signal_event *ev);
+ 
+ #else
+ static inline int ras_mc_event_opendb(unsigned int cpu,
+@@ -401,6 +419,8 @@ static inline int ras_store_cxl_dram_event(struct ras_events *ras,
+ 					   struct ras_cxl_dram_event *ev) { return 0; };
+ static inline int ras_store_cxl_memory_module_event(struct ras_events *ras,
+ 						    struct ras_cxl_memory_module_event *ev) { return 0; };
++static inline int ras_store_signal_event(struct ras_events *ras,
++					 struct ras_signal_event *ev) { return 0; };
+ 
+ #endif
+ 
+diff --git a/ras-report.c b/ras-report.c
+index 4535421..35d2792 100644
+--- a/ras-report.c
++++ b/ras-report.c
+@@ -13,6 +13,7 @@
+ #include <unistd.h>
+ 
+ #include "ras-report.h"
++#include "ras-record.h"
+ 
+ static int setup_report_socket(void)
+ {
+@@ -735,6 +736,37 @@ static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memor
+ 	return 0;
+ }
+ 
++static int set_signal_event_backtrace(char *buf, struct ras_signal_event *ev)
++{
++	unsigned int size = MAX_BACKTRACE_SIZE;
++
++	if (!buf || !ev)
++		return -1;
++
++	while (*buf && size > 0) {
++		buf++;
++		size--;
++	}
++
++	snprintf(buf, size, "BACKTRACE="
++		"timestamp=%s\n"
++		"signal=%d\n"
++		"errorno=%d\n"
++		"code=%d\n"
++		"comm=%s\n"
++		"grp=%d\n"
++		"res=%d\n",
++		ev->timestamp,
++		ev->sig,
++		ev->error_no,
++		ev->code,
++		ev->comm,
++		ev->group,
++		ev->result);
++
++	return 0;
++}
++
+ static int commit_report_backtrace(int sockfd, int type, void *ev)
+ {
+ 	char buf[MAX_BACKTRACE_SIZE];
+@@ -812,6 +844,10 @@ static int commit_report_backtrace(int sockfd, int type, void *ev)
+ 		rc = set_cxl_memory_module_event_backtrace(buf,
+ 							   (struct ras_cxl_memory_module_event *)ev);
+ 		break;
++	case SIGNAL_EVENT:
++		rc = set_signal_event_backtrace(buf,
++						(struct ras_signal_event *)ev);
++		break;
+ 	default:
+ 		return -1;
+ 	}
+@@ -1552,3 +1588,49 @@ cxl_memory_module_fail:
+ 
+ 	return -1;
+ }
++
++/*
++ * Send a SIGNAL_EVENT report (basic info, backtrace, ANALYZER, REASON) over
++ * the socket from setup_report_socket(). Returns 0 on success, -1 on failure.
++ */
++int ras_report_signal_event(struct ras_events *ras,
++			    struct ras_signal_event *ev)
++{
++	char buf[MAX_MESSAGE_SIZE];
++	int sockfd = 0;
++	int done = 0;
++	int rc = -1;
++
++	memset(buf, 0, sizeof(buf));
++
++	sockfd = setup_report_socket();
++	if (sockfd < 0)
++		return -1;
++
++	rc = commit_report_basic(sockfd);
++	if (rc < 0)
++		goto signal_fail;
++
++	rc = commit_report_backtrace(sockfd, SIGNAL_EVENT, ev);
++	if (rc < 0)
++		goto signal_fail;
++	/* write() may return -1: test the sign before the unsigned strlen() compare */
++	snprintf(buf, MAX_MESSAGE_SIZE, "ANALYZER=%s",
++		 "rasdaemon-signal_event");
++	rc = write(sockfd, buf, strlen(buf) + 1);
++	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
++		goto signal_fail;
++
++	snprintf(buf, MAX_MESSAGE_SIZE, "REASON=%s", "SIGBUS for Hardware error");
++	rc = write(sockfd, buf, strlen(buf) + 1);
++	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
++		goto signal_fail;
++
++	done = 1;
++signal_fail:
++	if (sockfd >= 0)
++		close(sockfd);
++	if (done)
++		return 0;
++
++	return -1;
++}
+diff --git a/ras-report.h b/ras-report.h
+index ceb64ce..f680a25 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -57,6 +57,8 @@ int ras_report_cxl_dram_event(struct ras_events *ras,
+ 			      struct ras_cxl_dram_event *ev);
+ int ras_report_cxl_memory_module_event(struct ras_events *ras,
+ 				       struct ras_cxl_memory_module_event *ev);
++int ras_report_signal_event(struct ras_events *ras,
++			    struct ras_signal_event *ev);
+ 
+ #else
+ 
+@@ -108,7 +110,9 @@ static inline int ras_report_cxl_dram_event(struct ras_events *ras,
+ static inline int ras_report_cxl_memory_module_event(struct ras_events *ras,
+ 						     struct ras_cxl_memory_module_event *ev)
+ { return 0; };
+-
++static inline int ras_report_signal_event(struct ras_events *ras,
++					  struct ras_signal_event *ev)
++{ return 0; };
+ #endif
+ 
+ #endif
+diff --git a/ras-signal-handler.c b/ras-signal-handler.c
+new file mode 100644
+index 0000000..fb0bfd3
+--- /dev/null
++++ b/ras-signal-handler.c
+@@ -0,0 +1,143 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2025 Ruidong Tian <tianruidong@linux.alibaba.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++#define _GNU_SOURCE
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <signal.h>
++
++#include "ras-signal-handler.h"
++#include "ras-report.h"
++#include "types.h"
++
++enum {
++	TRACE_SIGNAL_DELIVERED,
++	TRACE_SIGNAL_IGNORED,
++	TRACE_SIGNAL_ALREADY_PENDING,
++	TRACE_SIGNAL_OVERFLOW_FAIL,
++	TRACE_SIGNAL_LOSE_INFO,
++};
++
++static char *signal_msg[] = {
++	[BUS_ADRALN] = "invalid address alignment",
++	[BUS_ADRERR] = "non-existent address",
++	[BUS_OBJERR] = "object-specific hardware error",
++	[BUS_MCEERR_AR] = "Hardware memory error consumed: action required",
++	[BUS_MCEERR_AO] = "Hardware memory error detected in process but not consumed: action optional",
++};
++
++static char *errcode_str[] = {
++	[BUS_ADRALN] = "BUS_ADRALN",
++	[BUS_ADRERR] = "BUS_ADRERR",
++	[BUS_OBJERR] = "BUS_OBJERR",
++	[BUS_MCEERR_AR] = "BUS_MCEERR_AR",
++	[BUS_MCEERR_AO] = "BUS_MCEERR_AO",
++};
++
++static char *signal_res[] = {
++	[TRACE_SIGNAL_DELIVERED] = "Delivered",
++	[TRACE_SIGNAL_IGNORED] = "Ignore",
++	[TRACE_SIGNAL_ALREADY_PENDING] = "Already pending",
++	[TRACE_SIGNAL_OVERFLOW_FAIL] = "Overflow fail",
++	[TRACE_SIGNAL_LOSE_INFO] = "Lose info",
++};
++
++/* Append a one-line summary of @ev to @s; codes/results outside the lookup tables print "Unknown". */
++static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev)
++{
++	trace_seq_printf(s,
++			 "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s",
++			 ev->timestamp, strsignal(ev->sig), ev->error_no,
++			 (ev->code < BUS_ADRALN || ev->code > BUS_MCEERR_AO) ? "Unknown" : errcode_str[ev->code],
++			 ev->comm, ev->pid, ev->group,
++			 (ev->result < 0 || ev->result > TRACE_SIGNAL_LOSE_INFO) ? "Unknown" : signal_res[ev->result],
++			 (ev->sig == SIGBUS && ev->code >= BUS_ADRALN && ev->code <= BUS_MCEERR_AO) ? signal_msg[ev->code] : "Unknown");
++}
++
++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record,
++			     struct tep_event *event, void *context)
++{
++	int len;
++	unsigned long long val;
++	struct ras_events *ras = context;
++	time_t now;
++	struct tm *tm;
++	struct ras_signal_event ev = { 0 };	/* timestamp stays "" if localtime() fails */
++
++	/*
++	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
++	 * On previous kernels, the way to properly generate an event would
++	 * be to inject a fake one, measure its timestamp and diff it against
++	 * gettimeofday. We won't do it here. Instead, let's use uptime,
++	 * falling-back to the event report's time, if "uptime" clock is
++	 * not available (legacy kernels).
++	 */
++
++	if (ras->use_uptime)
++		now = record->ts / user_hz + ras->uptime_diff;
++	else
++		now = time(NULL);
++
++	tm = localtime(&now);
++	if (tm)
++		strftime(ev.timestamp, sizeof(ev.timestamp),
++			 "%Y-%m-%d %H:%M:%S %z", tm);
++
++	if (tep_get_field_val(s, event, "sig", record, &val, 1) < 0)
++		return -1;
++	ev.sig = val;
++
++	if (tep_get_field_val(s, event, "errno", record, &val, 1) < 0)
++		return -1;
++	ev.error_no = val;
++
++	if (tep_get_field_val(s, event, "code", record, &val, 1) < 0)
++		return -1;
++	ev.code = val;
++
++	ev.comm = tep_get_field_raw(s, event, "comm", record, &len, 1);
++	if (!ev.comm)
++		return -1;
++
++	if (tep_get_field_val(s, event, "pid", record, &val, 1) < 0)
++		return -1;
++	ev.pid = val;
++
++	if (tep_get_field_val(s, event, "group", record, &val, 1) < 0)
++		return -1;
++	ev.group = val;
++
++	if (tep_get_field_val(s, event, "result", record, &val, 1) < 0)
++		return -1;
++	ev.result = val;
++
++	report_ras_signal_event(s, &ev);
++
++	/* Store data into the SQLite DB */
++#ifdef HAVE_SQLITE3
++	ras_store_signal_event(ras, &ev);
++#endif
++
++#ifdef HAVE_ABRT_REPORT
++	/* Report event to ABRT */
++	ras_report_signal_event(ras, &ev);
++#endif
++
++	return 0;
++}
+diff --git a/ras-signal-handler.h b/ras-signal-handler.h
+new file mode 100644
+index 0000000..9740c61
+--- /dev/null
++++ b/ras-signal-handler.h
+@@ -0,0 +1,30 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Copyright (C) 2025 Ruidong Tian <tianruidong@linux.alibaba.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef __RAS_SIGNAL_HANDLER_H
++#define __RAS_SIGNAL_HANDLER_H
++
++#include <traceevent/event-parse.h>
++
++#include "ras-events.h"
++
++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record,
++			     struct tep_event *event, void *context);
++
++#endif
+diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
+index ba48660..648517f 100755
+--- a/util/ras-mc-ctl.in
++++ b/util/ras-mc-ctl.in
+@@ -35,6 +35,7 @@ my $has_disk_errors = 0;
+ my $has_extlog = 0;
+ my $has_mem_failure = 0;
+ my $has_mce = 0;
++my $has_signal = 0;
+ 
+ @WITH_AER_TRUE@$has_aer = 1;
+ @WITH_ARM_TRUE@$has_arm = 1;
+@@ -44,6 +45,7 @@ my $has_mce = 0;
+ @WITH_EXTLOG_TRUE@$has_extlog = 1;
+ @WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1;
+ @WITH_MCE_TRUE@$has_mce = 1;
++@WITH_SIGNAL_TRUE@$has_signal = 1;
+ 
+ my %conf        = ();
+ my %bus         = ();
+@@ -1546,7 +1548,7 @@ sub summary
+ {
+     require DBI;
+     my ($query, $query_handle, $out);
+-    my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
++    my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result, $sigcode);
+     my ($etype, $severity, $etype_string, $severity_string);
+     my ($dev_name, $dev);
+     my ($mpidr, $memdev);
+@@ -1828,6 +1830,24 @@ sub summary
+ 	$query_handle->finish;
+     }
+ 
++    # Signal event
++    if ($has_signal == 1) {
++	$query = "select code, count(*) from signal_event$conf{opt}{since} group by code";
++	$query_handle = $dbh->prepare($query);
++	$query_handle->execute();
++	$query_handle->bind_columns(\($sigcode, $count));
++	$out = "";
++	while($query_handle->fetch()) {
++	    $out .= "\t$sigcode errors: $count\n";
++	}
++	if ($out ne "") {
++	    print "SIGNAL events summary:\n$out\n";
++	} else {
++	    print "No SIGNAL.\n\n";
++	}
++	$query_handle->finish;
++    }
++
+     undef($dbh);
+ }
+ 
+@@ -1849,6 +1869,7 @@ sub errors
+     my ($nibble_mask, $bank_group, $row, $column, $cor_mask);
+     my ($event_type, $event_sub_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status);
+     my ($sub_type, $sub_channel, $cme_threshold_ev_flags, $cme_count, $cvme_count);
++    my ($signal, $errorno, $code, $comm, $pid, $grp, $res);
+ 
+     my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
+ 
+@@ -2366,6 +2387,25 @@ sub errors
+ 	$query_handle->finish;
+     }
+ 
++    # SIGNAL event
++    if ($has_signal == 1) {
++	$query = "select id, timestamp, signal, errorno, code, comm, pid, grp, res from signal_event$conf{opt}{since} order by id";
++	$query_handle = $dbh->prepare($query);
++	$query_handle->execute();
++	$query_handle->bind_columns(\($id, $timestamp, $signal, $errorno, $code, $comm, $pid, $grp, $res));
++	$out = "";
++	while($query_handle->fetch()) {
++	    $out .= "$id $timestamp error: ";
++	    $out .= "signal=$signal, errorno=$errorno, code=$code, comm=$comm, pid=$pid, grp=$grp, res=$res\n";
++	}
++	if ($out ne "") {
++	    print "SIGNAL events:\n$out\n";
++	} else {
++	    print "No SIGNAL event.\n\n";
++	}
++	$query_handle->finish;
++    }
++
+     undef($dbh);
+ }
+ 
+-- 
+2.43.5
+
diff --git a/1004-rasdaemon-align-event-name-in-log.patch b/1004-rasdaemon-align-event-name-in-log.patch
new file mode 100644
index 0000000000000000000000000000000000000000..37f94a48c5ffd4e78cbd48032b64713ef071ec5c
--- /dev/null
+++ b/1004-rasdaemon-align-event-name-in-log.patch
@@ -0,0 +1,34 @@
+From 86a6cbb904a50269c901ba2ed591fde7debfa298 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 18 Mar 2025 15:52:41 +0800
+Subject: [PATCH 04/30] rasdaemon: align event name in log
+
+Currently, rasdaemon event names are not aligned in the log:
+
+  <...>-52503 [070] dNh.     0.007127 arm_event ...
+  <...>-52503 [052] ....     0.007127 memory_failure_event ...
+Align them so that the result looks like:
+  <...>-113714     [059] dNh.     0.007942                 arm_event: ...
+  <...>-113714     [069] ....     0.007942      memory_failure_event: ...
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ ras-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/ras-events.c b/ras-events.c
+index 2220e9a..88c8a5f 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -418,7 +418,7 @@ static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf,
+ 	/* TODO - logging */
+ 	trace_seq_init(&s);
+ 	tep_print_event(pdata->ras->pevent, &s, &record,
+-			"%16s-%-5d [%03d] %s %6.1000d %s %s",
++			"%16s-%-10d [%03d] %s %6.1000d %25s: %s",
+ 			TEP_PRINT_COMM, TEP_PRINT_PID, TEP_PRINT_CPU,
+ 			TEP_PRINT_LATENCY, TEP_PRINT_TIME, TEP_PRINT_NAME,
+ 			TEP_PRINT_INFO);
+-- 
+2.43.5
+
diff --git a/1005-rasdaemon-skip-doesn-t-exist-event.patch b/1005-rasdaemon-skip-doesn-t-exist-event.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1f6cbd2a611556a1f826bffbff65a0896af068f9
--- /dev/null
+++ b/1005-rasdaemon-skip-doesn-t-exist-event.patch
@@ -0,0 +1,56 @@
+From 7a13978040e6aa3e841cbbd5e6f91e5f98ae8d82 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 25 Mar 2025 10:16:13 +0800
+Subject: [PATCH 05/30] rasdaemon: skip doesn't exist event
+
+When compiling rasdaemon with the --enable-all configuration flag,
+the system may detect unsupported hardware events - for instance,
+ARM-specific events on x86 architectures. This causes the program
+to enter a busy-wait loop in the wait_access function. A better
+approach would be to explicitly skip these architecture-mismatched
+events during initialization.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ ras-events.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/ras-events.c b/ras-events.c
+index 88c8a5f..d42ed9f 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -826,6 +826,18 @@ static int select_tracing_timestamp(struct ras_events *ras)
+ 	return 0;
+ }
+ 
++static bool check_event_exist(struct ras_events *ras, char *group, char *event)
++{
++	char fname[MAX_PATH + 256];
++
++	snprintf(fname, sizeof(fname), "%s/tracing/events/%s/%s",
++		 ras->debugfs, group, event);
++	if (access(fname, F_OK) == 0)
++		return true;
++
++	return false;
++}
++
+ #define EVENT_DISABLED	1
+ 
+ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
+@@ -837,6 +849,12 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
+ 	char *page, fname[MAX_PATH + 1];
+ 	struct tep_event_filter *filter = NULL;
+ 
++	if (!check_event_exist(ras, group, event)) {
++		log(ALL, LOG_WARNING, "%s:%s event not exist\n",
++		    group, event);
++		return -EINVAL;
++	}
++
+ 	snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event);
+ 
+ 	fd = open_trace(ras, fname, O_RDONLY);
+-- 
+2.43.5
+
diff --git a/1006-rasdaemon-support-memory-corrected-error-statistics.patch b/1006-rasdaemon-support-memory-corrected-error-statistics.patch
new file mode 100644
index 0000000000000000000000000000000000000000..845a3600e10148471f43eae1d8d2bce288cf1cb3
--- /dev/null
+++ b/1006-rasdaemon-support-memory-corrected-error-statistics.patch
@@ -0,0 +1,124 @@
+From 32bd3dc84cc235dc589ae6ac149a3567c7b501a6 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 25 Mar 2025 18:36:07 +0800
+Subject: [PATCH 06/30] rasdaemon: support memory corrected error statistics
+
+A high volume of Correctable Errors (CEs) indicates that the
+memory controller is frequently performing Error-Correcting Code (ECC)
+operations, which will increase memory controller latency.
+The CE statistics feature can report the number of CEs occurring per
+second. When the count exceeds a certain threshold, it signifies
+intensive ECC activity and triggers warnings.
+
+A new environment variable, MC_CE_STAT_THRESHOLD, sets the threshold.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ misc/rasdaemon.env |  5 +++++
+ ras-mc-handler.c   | 23 +++++++++++++++++++++++
+ ras-mc-handler.h   |  1 +
+ rasdaemon.c        |  7 +++++++
+ 4 files changed, 36 insertions(+)
+
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 963aaa0..4375781 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -88,3 +88,8 @@ TRIGGER_DIR=
+ #   MC_UE_TRIGGER=mc_event_trigger
+ MC_CE_TRIGGER=
+ MC_UE_TRIGGER=
++
++# CE Statistic Threshold
++#
++# Specify the threshold of CE per second.
++MC_CE_STAT_THRESHOLD=2000
+\ No newline at end of file
+diff --git a/ras-mc-handler.c b/ras-mc-handler.c
+index fdd85a9..7a18f73 100644
+--- a/ras-mc-handler.c
++++ b/ras-mc-handler.c
+@@ -103,6 +103,27 @@ free:
+ 		free(env[i]);
+ }
+ 
++static unsigned long long per_sec_ce_count;
++unsigned long long mc_ce_stat_threshold;
++static time_t cur;
++static int ras_mc_event_stat(time_t now, struct ras_mc_event *e)
++{
++	if (strcmp(e->error_type, "Corrected"))
++		return 0;
++	/* Accumulate corrected errors per distinct 'now' value; restart when it changes */
++	if (cur == now) {
++		per_sec_ce_count += e->error_count;
++	} else {
++		cur = now;
++		per_sec_ce_count = e->error_count;
++	}
++	/* A zero threshold (MC_CE_STAT_THRESHOLD unset) disables the warning */
++	if (mc_ce_stat_threshold && per_sec_ce_count > mc_ce_stat_threshold)
++		log(ALL, LOG_ERR, "    mc_event_stat: memory corrected error report %llu/sec\n", per_sec_ce_count);
++
++	return 0;
++}
++
+ int ras_mc_event_handler(struct trace_seq *s,
+ 			 struct tep_record *record,
+ 			 struct tep_event *event, void *context)
+@@ -263,6 +284,8 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 
+ 	ras_store_mc_event(ras, &ev);
+ 
++	ras_mc_event_stat(now, &ev);
++
+ #ifdef HAVE_MEMORY_CE_PFA
+ 	/* Account page corrected errors */
+ 	if (!strcmp(ev.error_type, "Corrected"))
+diff --git a/ras-mc-handler.h b/ras-mc-handler.h
+index 2aa3c28..cf12959 100644
+--- a/ras-mc-handler.h
++++ b/ras-mc-handler.h
+@@ -10,6 +10,7 @@
+ #include <traceevent/event-parse.h>
+ 
+ #include "ras-events.h"
++extern unsigned long long mc_ce_stat_threshold;
+ 
+ void mc_event_trigger_setup(void);
+ 
+diff --git a/rasdaemon.c b/rasdaemon.c
+index 840be61..d97665f 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -13,6 +13,7 @@
+ #include "ras-events.h"
+ #include "ras-logger.h"
+ #include "ras-record.h"
++#include "ras-mc-handler.h"
+ #include "types.h"
+ 
+ /*
+@@ -23,6 +24,7 @@
+ #define TOOL_DESCRIPTION "RAS daemon to log the RAS events."
+ #define ARGS_DOC "<options>"
+ #define DISABLE "DISABLE"
++#define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD"
+ 
+ const char *argp_program_version = TOOL_NAME " " VERSION;
+ const char *argp_program_bug_address = "Mauro Carvalho Chehab <mchehab@kernel.org>";
+@@ -126,6 +128,11 @@ int main(int argc, char *argv[])
+ 
+ 	choices_disable = getenv(DISABLE);
+ 
++	if (getenv(MC_CE_STAT_THRESHOLD))
++		mc_ce_stat_threshold = strtoull(getenv(MC_CE_STAT_THRESHOLD), NULL, 0);
++	if (mc_ce_stat_threshold)
++		log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold);
++
+ #ifdef HAVE_MCE
+ 	const struct argp_option offline_options[] = {
+ 		{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+-- 
+2.43.5
+
diff --git a/1007-rasdaemon-introduce-poison-page-statistics.patch b/1007-rasdaemon-introduce-poison-page-statistics.patch
new file mode 100644
index 0000000000000000000000000000000000000000..12dd35e30ffa21be01e8f81b6af46496eee12fd5
--- /dev/null
+++ b/1007-rasdaemon-introduce-poison-page-statistics.patch
@@ -0,0 +1,249 @@
+From 9e9a9b7cd802f7874f674fb024ef0dd93e223060 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Wed, 26 Mar 2025 14:03:33 +0800
+Subject: [PATCH 07/30] rasdaemon: introduce poison page statistics
+
+An excessive number of poison pages can lead to memory fragmentation,
+which may degrade system performance. This patch introduces a threshold
+monitoring mechanism for poison pages. When the number of poison pages
+exceeds the predefined threshold, a warning is issued to alert
+administrators.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                  |  7 +++++-
+ configure.ac                 |  6 ++++++
+ misc/rasdaemon.env           |  8 ++++++-
+ ras-memory-failure-handler.c |  5 +++++
+ ras-memory-failure-handler.h |  2 ++
+ ras-page-isolation.c         |  6 ++++++
+ ras-poison-page-stat.c       | 41 ++++++++++++++++++++++++++++++++++++
+ ras-poison-page-stat.h       | 14 ++++++++++++
+ rasdaemon.c                  |  9 ++++++++
+ 9 files changed, 96 insertions(+), 2 deletions(-)
+ create mode 100644 ras-poison-page-stat.c
+ create mode 100644 ras-poison-page-stat.h
+
+diff --git a/Makefile.am b/Makefile.am
+index 1306d97..56e992d 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -116,6 +116,10 @@ if WITH_SIGNAL
+    rasdaemon_SOURCES += ras-signal-handler.c
+ endif
+ 
++if WITH_POISON_PAGE_STAT
++   rasdaemon_SOURCES += ras-poison-page-stat.c
++endif
++
+ rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS)
+ rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS)
+ 
+@@ -125,7 +129,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+-		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h
++		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \
++		  ras-poison-page-stat.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/configure.ac b/configure.ac
+index 25e0cb2..5fe1862 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -254,6 +254,12 @@ AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [
+ AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"])
+ 
++AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_DEFINE(HAVE_POISON_PAGE_STAT,1,"have poison page statistics")
++  AC_SUBST([WITH_POISON_PAGE_STAT])
++])
++AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ])
++
+ test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
+ 
+ CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 4375781..3aa3a0d 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -92,4 +92,10 @@ MC_UE_TRIGGER=
+ # CE Statistic Threshold
+ #
+ # Specify the threshold of CE per second.
+-MC_CE_STAT_THRESHOLD=2000
+\ No newline at end of file
++MC_CE_STAT_THRESHOLD=2000
++
++# Poison page statistics
++#
++# Supported units:
++# POISON_STAT_THRESHOLD: kB
++POISON_STAT_THRESHOLD=102400
+diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
+index 4d20ce8..d4c293b 100644
+--- a/ras-memory-failure-handler.c
++++ b/ras-memory-failure-handler.c
+@@ -12,6 +12,7 @@
+ 
+ #include "ras-logger.h"
+ #include "ras-memory-failure-handler.h"
++#include "ras-poison-page-stat.h"
+ #include "ras-report.h"
+ #include "trigger.h"
+ #include "types.h"
+@@ -208,6 +209,10 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
+ 	ev.action_result = get_action_result(val);
+ 	trace_seq_printf(s, "action_result=%s ", ev.action_result);
+ 
++#ifdef HAVE_POISON_PAGE_STAT
++	ras_poison_page_stat();
++#endif
++
+ 	/* Store data into the SQLite DB */
+ #ifdef HAVE_SQLITE3
+ 	ras_store_mf_event(ras, &ev);
+diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h
+index f0cea71..85e2dd2 100644
+--- a/ras-memory-failure-handler.h
++++ b/ras-memory-failure-handler.h
+@@ -11,6 +11,8 @@
+ 
+ #include "ras-events.h"
+ 
++extern unsigned long long poison_stat_threshold;
++
+ void mem_fail_event_trigger_setup(void);
+ int ras_memory_failure_event_handler(struct trace_seq *s,
+ 				     struct tep_record *record,
+diff --git a/ras-page-isolation.c b/ras-page-isolation.c
+index 2166f5c..246cd12 100644
+--- a/ras-page-isolation.c
++++ b/ras-page-isolation.c
+@@ -15,6 +15,8 @@
+ 
+ #include "ras-logger.h"
+ #include "ras-page-isolation.h"
++#include "ras-poison-page-stat.h"
++#include "ras-record.h"
+ 
+ #define PARSED_ENV_LEN 50
+ #define ROW_ID_MAX_LEN 200
+@@ -349,6 +351,10 @@ static void page_offline(struct page_record *pr)
+ 
+ 	log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n",
+ 	    addr, page_state[pr->offlined]);
++
++#ifdef HAVE_POISON_PAGE_STAT
++	ras_poison_page_stat();
++#endif
+ }
+ 
+ static void page_record(struct page_record *pr, unsigned int count, time_t time)
+diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c
+new file mode 100644
+index 0000000..2ce1d2a
+--- /dev/null
++++ b/ras-poison-page-stat.c
+@@ -0,0 +1,41 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <sys/syslog.h>
++
++#include "ras-logger.h"
++#include "ras-poison-page-stat.h"
++#include "types.h"
++
++unsigned long long poison_stat_threshold;
++int ras_poison_page_stat(void)
++{
++	FILE *fp;
++	char line[MAX_PATH];
++	unsigned long long corrupted_kb = 0;
++
++	fp = fopen("/proc/meminfo", "r");
++	if (!fp) {
++		log(ALL, LOG_ERR, "Failed to open /proc/meminfo\n");
++		return EXIT_FAILURE;
++	}
++	/* Take the first HardwareCorrupted line whose value parses */
++	while (fgets(line, sizeof(line), fp))
++		if (strstr(line, "HardwareCorrupted"))
++			if (sscanf(line, "%*s %llukB", &corrupted_kb) == 1)
++				break;
++
++	fclose(fp);
++	/* A zero threshold (POISON_STAT_THRESHOLD unset) disables the warning */
++	if (poison_stat_threshold && corrupted_kb > poison_stat_threshold)
++		log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %llu kB (threshold: %llu kB)\n",
++		    corrupted_kb, poison_stat_threshold);
++
++	return 0;
++}
+diff --git a/ras-poison-page-stat.h b/ras-poison-page-stat.h
+new file mode 100644
+index 0000000..4fe25d2
+--- /dev/null
++++ b/ras-poison-page-stat.h
+@@ -0,0 +1,14 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#ifndef __RAS_POISON_PAGE_STAT_H
++#define __RAS_POISON_PAGE_STAT_H
++
++extern unsigned long long poison_stat_threshold;
++
++int ras_poison_page_stat(void);
++
++#endif
+diff --git a/rasdaemon.c b/rasdaemon.c
+index d97665f..6505dee 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -12,6 +12,7 @@
+ 
+ #include "ras-events.h"
+ #include "ras-logger.h"
++#include "ras-poison-page-stat.h"
+ #include "ras-record.h"
+ #include "ras-mc-handler.h"
+ #include "types.h"
+@@ -25,6 +26,7 @@
+ #define ARGS_DOC "<options>"
+ #define DISABLE "DISABLE"
+ #define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD"
++#define POISON_STAT_THRESHOLD "POISON_STAT_THRESHOLD"
+ 
+ const char *argp_program_version = TOOL_NAME " " VERSION;
+ const char *argp_program_bug_address = "Mauro Carvalho Chehab <mchehab@kernel.org>";
+@@ -133,6 +135,13 @@ int main(int argc, char *argv[])
+ 	if (mc_ce_stat_threshold)
+ 		log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold);
+ 
++#ifdef HAVE_POISON_PAGE_STAT
++	if (getenv(POISON_STAT_THRESHOLD))
++		poison_stat_threshold = strtoull(getenv(POISON_STAT_THRESHOLD), NULL, 0);
++	if (poison_stat_threshold)
++		log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold);
++#endif
++
+ #ifdef HAVE_MCE
+ 	const struct argp_option offline_options[] = {
+ 		{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+-- 
+2.43.5
+
diff --git a/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ee72a7446bf0c4fe82dea7b494a4d696e6295268
--- /dev/null
+++ b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch
@@ -0,0 +1,468 @@
+From d64ff047a5ab231ee6c1a797dc3ce612fb7a5a6c Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 12 Dec 2024 09:37:06 +0800
+Subject: [PATCH 08/30] rasdaemon: erst: decode panic mce through erst
+
+ERST records the MCE information that caused the kernel panic,
+helping us determine the cause of the last crash.
+Use rasdaemon to check and parse the ERST records at startup.
+The decoded info looks like the following:
+           <...>-0          [-01] .... 0.000000           mce_erst_record: 2025-03-26 14:52:42 +0800 bank=1, status= bd80000000100134, Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error, mci=Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR, mca=Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error K, cpu_type= Sapphirerapids server, cpu= 159, socketid= 1, ip= ffffffff914a6476, cs= 10, misc= 86, addr= 8158f58400, mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE, mcgcap= f000c15, apicid= 9f, ppin= fc6b80e0ba9d616, microcode= 2b000571
+
+A new environment variable, ERST_DELETE, is introduced: rasdaemon deletes
+the original ERST file if ERST_DELETE is set to a non-zero value.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |   5 +-
+ configure.ac       |  11 +++
+ misc/rasdaemon.env |   2 +
+ ras-erst.c         | 195 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-erst.h         |  17 ++++
+ ras-mce-handler.c  |  35 ++++++--
+ ras-mce-handler.h  |   4 +
+ ras-record.h       |   4 +
+ rasdaemon.c        |  11 +++
+ 9 files changed, 275 insertions(+), 9 deletions(-)
+ create mode 100644 ras-erst.c
+ create mode 100644 ras-erst.h
+
+diff --git a/Makefile.am b/Makefile.am
+index 56e992d..e1bcda1 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -119,6 +119,9 @@ endif
+ if WITH_POISON_PAGE_STAT
+    rasdaemon_SOURCES += ras-poison-page-stat.c
+ endif
++if WITH_ERST
++   rasdaemon_SOURCES += ras-erst.c
++endif
+ 
+ rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS)
+ rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS)
+@@ -130,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+ 		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \
+-		  ras-poison-page-stat.h
++		  ras-poison-page-stat.h ras-erst.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/configure.ac b/configure.ac
+index 5fe1862..47e6346 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -260,6 +260,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pf
+ ])
+ AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ])
+ 
++AC_ARG_ENABLE([erst],
++    AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)]))
++
++AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_DEFINE(HAVE_ERST,1,"have ERST")
++  AC_SUBST([WITH_ERST])
++])
++AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes])
++AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"])
++
+ test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
+ 
+ CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
+@@ -307,4 +317,5 @@ compile time options summary
+     YITIAN RAS errors   : $USE_YITIAN_NS_DECODE
+     JAGUAR RAS errors   : $USE_JAGUAR_NS_DECODE
+     Signal              : $USE_SIGNAL
++    ERST                : $USE_ERST
+ EOF
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 3aa3a0d..193ee19 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -99,3 +99,5 @@ MC_CE_STAT_THRESHOLD=2000
+ # Supported units:
+ # POISON_STAT_THRESHOLD: kB
+ POISON_STAT_THRESHOLD=102400
++
++ERST_DELETE=1
+diff --git a/ras-erst.c b/ras-erst.c
+new file mode 100644
+index 0000000..c024d60
+--- /dev/null
++++ b/ras-erst.c
+@@ -0,0 +1,195 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include <dirent.h>
++#include <stdlib.h>
++#include <sys/stat.h>
++#include <unistd.h>
++
++#include "ras-events.h"
++#include "ras-erst.h"
++#include "ras-logger.h"
++#include "ras-mce-handler.h"
++#include "ras-record.h"
++#include "types.h"
++
++struct mce {
++	uint64_t status;		/* Bank's MCi_STATUS MSR */
++	uint64_t misc;		/* Bank's MCi_MISC MSR */
++	uint64_t addr;		/* Bank's MCi_ADDR MSR */
++	uint64_t mcgstatus;	/* Machine Check Global Status MSR */
++	uint64_t ip;		/* Instruction Pointer when the error happened */
++	uint64_t tsc;		/* CPU time stamp counter */
++	uint64_t time;		/* Wall time_t when error was detected */
++	uint8_t  cpuvendor;	/* Kernel's X86_VENDOR enum */
++	uint8_t  inject_flags;	/* Software inject flags */
++	uint8_t  severity;		/* Error severity */
++	uint8_t  pad;
++	uint32_t cpuid;		/* CPUID 1 EAX */
++	uint8_t  cs;		/* Code segment */
++	uint8_t  bank;		/* Machine check bank reporting the error */
++	uint8_t  cpu;		/* CPU number; obsoleted by extcpu */
++	uint8_t  finished;		/* Entry is valid */
++	uint32_t extcpu;		/* Linux CPU number that detected the error */
++	uint32_t socketid;		/* CPU socket ID */
++	uint32_t apicid;		/* CPU initial APIC ID */
++	uint64_t mcgcap;		/* MCGCAP MSR: machine check capabilities of CPU */
++	uint64_t synd;		/* MCA_SYND MSR: only valid on SMCA systems */
++	uint64_t ipid;		/* MCA_IPID MSR: only valid on SMCA systems */
++	uint64_t ppin;		/* Protected Processor Inventory Number */
++	uint32_t microcode;	/* Microcode revision */
++};
++
++static int erst_delete;
++
++#define ERST_PATH "/sys/fs/pstore/erst"
++#define MCE_ERST_PREFIX "mce-erst"
++#define ERST_EVENT_NAME "mce_erst_record"
++
++#ifdef HAVE_MCE
++static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e)
++{
++	struct mce_priv *mce = ras->mce_priv;
++	struct trace_seq s;
++	int rc = 0;
++
++	switch (mce->cputype) {
++	case CPU_GENERIC:
++		break;
++	case CPU_K8:
++		rc = parse_amd_k8_event(ras, e);
++		break;
++	case CPU_AMD_SMCA:
++	case CPU_DHYANA:
++		rc = parse_amd_smca_event(ras, e);
++		break;
++	default:			/* All other CPU types are Intel */
++		rc = parse_intel_event(ras, e);
++	}
++
++	if (rc)
++		return;
++
++	mce_snprintf(e->error_msg, "%s", e->mcastatus_msg);
++
++	trace_seq_init(&s);
++	trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
++			 "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME);
++
++	report_mce_event(ras, NULL, &s, e);
++	trace_seq_terminate(&s);
++	trace_seq_do_printf(&s);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&s);
++}
++
++/* Read one raw struct mce from @path into @e; unlink the record afterwards when erst_delete is set. */
++static void handle_erst_mce_file(char *path, struct mce_event *e)
++{
++	FILE *file;
++	struct mce mce;
++	struct stat file_stat;
++
++	file = fopen(path, "r");
++	if (!file) {
++		log(ALL, LOG_ERR, "Failed to open file %s\n", path);
++		return;
++	}
++
++	if (stat(path, &file_stat) < 0) {
++		log(ALL, LOG_ERR, "Failed to stat file %s\n", path);
++		goto out;
++	}
++
++	if (fread((char *)&mce, 1, sizeof(mce), file) < sizeof(mce)) {
++		log(ALL, LOG_ERR, "Failed to read file %s\n", path);
++		goto out;
++	}
++
++	e->mcgcap = mce.mcgcap;
++	e->mcgstatus = mce.mcgstatus;
++	e->status = mce.status;
++	e->addr = mce.addr;
++	e->misc = mce.misc;
++	e->synd = mce.synd;
++	e->ipid = mce.ipid;
++	e->ip = mce.ip;
++	e->tsc = mce.tsc;
++	e->walltime = mce.time;
++	e->cpu = mce.extcpu;
++	e->cpuid = mce.cpuid;
++	e->apicid = mce.apicid;
++	e->socketid = mce.socketid;
++	e->cs = mce.cs;
++	e->bank = mce.bank;
++	e->cpuvendor = mce.cpuvendor;
++	e->ppin = mce.ppin;
++	e->microcode = mce.microcode;
++
++	if (erst_delete) {
++		if (!unlink(path))
++			log(ALL, LOG_INFO, "Deleted file %s\n", path);
++		else
++			log(ALL, LOG_ERR, "Failed to delete file %s\n", path);
++	}
++
++out:
++	fclose(file);
++}
++
++static void handle_erst_mce(void)
++{
++	int rc;
++	struct ras_events ras = { 0 };
++	struct dirent *entry;
++	DIR *dir;
++
++	rc = init_mce_priv(&ras);
++	if (rc) {
++		log(ALL, LOG_INFO, "Can't register mce handler\n");
++		return;
++	}
++
++	dir = opendir(ERST_PATH);
++	if (!dir) {
++		log(ALL, LOG_INFO, "Failed to open directory\n");
++		return;
++	}
++
++	while ((entry = readdir(dir)) != NULL) {
++		struct stat path_stat;
++		char file_path[MAX_PATH];
++		struct mce_event mce = { 0 };
++
++		mce.erst = 1;
++		if (strncmp(entry->d_name, MCE_ERST_PREFIX, strlen(MCE_ERST_PREFIX)))
++			continue;
++
++		snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name);
++		stat(file_path, &path_stat);
++
++		if (S_ISREG(path_stat.st_mode)) {
++			handle_erst_mce_file(file_path, &mce);
++		} else {
++			log(TERM, LOG_ERR, "Unexpected file type\n");
++			continue;
++		}
++
++		ras_erst_mce_handler(&ras, &mce);
++	}
++
++	closedir(dir);
++}
++#endif
++void handle_erst(void)
++{
++	if (getenv(ERST_DELETE))	/* non-zero: unlink each record after reading it */
++		erst_delete = atoi(getenv(ERST_DELETE));
++#ifdef HAVE_MCE	/* ERST just support mce now; handle_erst_mce() needs HAVE_MCE */
++	handle_erst_mce();
++#endif
++}
+diff --git a/ras-erst.h b/ras-erst.h
+new file mode 100644
+index 0000000..83d7535
+--- /dev/null
++++ b/ras-erst.h
+@@ -0,0 +1,17 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#ifndef __RAS_ERST_H
++#define __RAS_ERST_H
++
++/* Name of the environment variable parsed by handle_erst() */
++#define ERST_DELETE	"ERST_DELETE"
++
++/*
++ * Only exported entry point; the MCE replay helper is static in ras-erst.c.
++ */
++void handle_erst(void);
++#endif
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index 8713390..3d8d97d 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -228,7 +228,7 @@ ret:
+ 	return ret;
+ }
+ 
+-int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
++int init_mce_priv(struct ras_events *ras)
+ {
+ 	int rc;
+ 	struct mce_priv *mce;
+@@ -249,6 +249,11 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
+ 		ras->mce_priv = NULL;
+ 		return rc;
+ 	}
++
++	return rc;
++}
++static void set_imc_log(struct mce_priv *mce, unsigned int ncpus)
++{
+ 	switch (mce->cputype) {
+ 	case CPU_SANDY_BRIDGE_EP:
+ 	case CPU_IVY_BRIDGE_EPEX:
+@@ -259,6 +264,17 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
+ 	default:
+ 		break;
+ 	}
++}
++
++int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
++{
++	int rc;
++
++	rc = init_mce_priv(ras);
++	if (rc)
++		return rc;
++
++	set_imc_log(ras->mce_priv, ncpus);
+ 
+ 	return rc;
+ }
+@@ -267,9 +283,8 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
+  * End of mcelog's code
+  */
+ 
+-static void report_mce_event(struct ras_events *ras,
+-			     struct tep_record *record,
+-			     struct trace_seq *s, struct mce_event *e)
++void report_mce_event(struct ras_events *ras, struct tep_record *record,
++		      struct trace_seq *s, struct mce_event *e)
+ {
+ 	time_t now;
+ 	struct tm *tm;
+@@ -284,10 +299,14 @@ static void report_mce_event(struct ras_events *ras,
+ 	 * not available (legacy kernels).
+ 	 */
+ 
+-	if (ras->use_uptime)
+-		now = record->ts / user_hz + ras->uptime_diff;
+-	else
+-		now = time(NULL);
++	if (!e->erst) {
++		if (ras->use_uptime)
++			now = record->ts / user_hz + ras->uptime_diff;
++		else
++			now = time(NULL);
++	} else {
++		now = e->walltime;
++	}
+ 
+ 	tm = localtime(&now);
+ 	if (tm)
+diff --git a/ras-mce-handler.h b/ras-mce-handler.h
+index 57984ec..f120874 100644
+--- a/ras-mce-handler.h
++++ b/ras-mce-handler.h
+@@ -78,6 +78,7 @@ struct mce_event {
+ 	char		mcastatus_msg[1024];
+ 	char		user_action[4096];
+ 	char		mc_location[256];
++	int		erst;
+ };
+ 
+ struct mce_priv {
+@@ -108,6 +109,7 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus);
+ int ras_mce_event_handler(struct trace_seq *s,
+ 			  struct tep_record *record,
+ 			  struct tep_event *event, void *context);
++int init_mce_priv(struct ras_events *ras);
+ 
+ /* enables intel iMC logs */
+ int set_intel_imc_log(enum cputype cputype, unsigned int ncpus);
+@@ -170,4 +172,6 @@ int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
+ 
+ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
+ 
++void report_mce_event(struct ras_events *ras, struct tep_record *record,
++		      struct trace_seq *s, struct mce_event *e);
+ #endif
+diff --git a/ras-record.h b/ras-record.h
+index 2dd6630..eb5b838 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -28,6 +28,7 @@ struct ras_mc_event {
+ 	signed char top_layer, middle_layer, lower_layer;
+ 	unsigned long long address, grain, syndrome;
+ 	const char *driver_detail;
++	int erst;
+ };
+ 
+ struct ras_mc_offline_event {
+@@ -46,6 +47,9 @@ struct ras_aer_event {
+ 	uint8_t tlp_header_valid;
+ 	uint32_t *tlp_header;
+ 	const char *msg;
++	int erst;
++	uint16_t vendor_id;
++	uint16_t device_id;
+ };
+ 
+ struct ras_extlog_event {
+diff --git a/rasdaemon.c b/rasdaemon.c
+index 6505dee..be5c390 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -10,6 +10,7 @@
+ #include <string.h>
+ #include <unistd.h>
+ 
++#include "ras-erst.h"
+ #include "ras-events.h"
+ #include "ras-logger.h"
+ #include "ras-poison-page-stat.h"
+@@ -225,6 +226,16 @@ int main(int argc, char *argv[])
+ 		if (daemon(0, 0))
+ 			exit(EXIT_FAILURE);
+ 
++#ifdef HAVE_ERST
++#ifdef HAVE_MCE
++	if (choices_disable && strlen(choices_disable) != 0 &&
++	    strstr(choices_disable, "ras:erst"))
++		log(ALL, LOG_INFO, "Disabled ras:erst from config\n");
++	else
++		handle_erst();
++#endif
++#endif
++
+ 	handle_ras_events(args.record_events, args.enable_ipmitool);
+ 
+ 	return 0;
+-- 
+2.43.5
+
diff --git a/1009-aer-print-pci-device-name-and-vendor-device-id.patch b/1009-aer-print-pci-device-name-and-vendor-device-id.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ac5177eaf1b87638bb38f1f189dd066dcd024b01
--- /dev/null
+++ b/1009-aer-print-pci-device-name-and-vendor-device-id.patch
@@ -0,0 +1,166 @@
+From 5d8df52470036771ee97fa93ea0abcf3c3fbb3f3 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 27 Mar 2025 17:27:38 +0800
+Subject: [PATCH 09/30] aer: print pci device name and vendor/device id
+
+The new aer log looks like the following:
+
+	<...>-2682840 [125] ....     0.017661 aer_event 2025-03-27
+17:34:44 +0800 0000:99:00.0 (Intel Corporation Device 0b60 -
+vendor_id: 0x8086 device_id: 0xb60) Data Link Protocol Uncorrected
+(Non-Fatal)
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am            |  4 ++--
+ configure.ac           |  8 ++++++++
+ misc/rasdaemon.spec.in |  2 ++
+ ras-aer-handler.c      | 46 +++++++++++++++++++++++++++++++++++++++++-
+ ras-record.h           |  2 +-
+ 5 files changed, 58 insertions(+), 4 deletions(-)
+
+diff --git a/Makefile.am b/Makefile.am
+index e1bcda1..2911a21 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -123,8 +123,8 @@ if WITH_ERST
+    rasdaemon_SOURCES += ras-erst.c
+ endif
+ 
+-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS)
+-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS)
++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS)
++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS)
+ 
+ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
+diff --git a/configure.ac b/configure.ac
+index 47e6346..3603c7f 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -54,6 +54,14 @@ AC_ARG_ENABLE([aer],
+ AS_IF([test "x$enable_aer" = "xyes" || test "x$enable_all" = "xyes"], [
+   AC_DEFINE(HAVE_AER,1,"have PCIe AER events collect")
+   AC_SUBST([WITH_AER])
++
++  has_libpci_ver=0
++  dnl check for pciutils library
++  PKG_CHECK_MODULES([LIBPCI], [libpci], [has_libpci_ver=1])
++
++  AS_IF([test "$has_libpci_ver" -eq 0], [
++    AC_MSG_ERROR([libpci is required but were not found])
++])
+ ])
+ AM_CONDITIONAL([WITH_AER], [test x$enable_aer = xyes || test x$enable_all = xyes])
+ AM_COND_IF([WITH_AER], [USE_AER="yes"], [USE_AER="no"])
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index 4cc859f..a30045c 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -17,10 +17,12 @@ BuildRequires:		perl-generators
+ BuildRequires:		sqlite-devel
+ BuildRequires:		systemd
+ BuildRequires:		libtraceevent-devel
++BuildRequires:		pciutils-devel
+ Provides:		bundled(kernel-event-lib)
+ Requires:		hwdata
+ Requires:		perl-DBD-SQLite
+ Requires:		libtraceevent
++Requires:		pciutils-libs
+ %ifarch %{ix86} x86_64
+ Requires:		dmidecode
+ %endif
+diff --git a/ras-aer-handler.c b/ras-aer-handler.c
+index 5d069f3..53acbc8 100644
+--- a/ras-aer-handler.c
++++ b/ras-aer-handler.c
+@@ -4,6 +4,7 @@
+  * Copyright (C) 2013 Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+  */
+ 
++#include <pci/pci.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+@@ -63,6 +64,45 @@ void ras_aer_handler_init(int enable_ipmitool)
+ 
+ #define BUF_LEN	1024
+ 
++/* Look up the PCI device at @bdf: fill @pci_name with its name and return its IDs */
++static void get_pci_dev_name(char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id)
++{
++	struct pci_access *pacc;
++	struct pci_dev *dev;
++	struct pci_filter filter = {0};
++	char slot[64];
++
++	if (!pci_name || !bdf)
++		return;
++
++	pacc = pci_alloc();
++	if (!pacc)
++		return;
++
++	pci_init(pacc);
++	pci_scan_bus(pacc);
++	pci_filter_init(pacc, &filter);
++	snprintf(slot, sizeof(slot), "%s", bdf);	/* parse a copy: libpci may split the string in place */
++	if (pci_filter_parse_slot(&filter, slot)) {
++		log(TERM, LOG_ERR, "Invalid PCI device name %s\n", bdf);
++		goto free;
++	}
++
++	for (dev = pacc->devices; dev; dev = dev->next) {
++		if (pci_filter_match(&filter, dev)) {
++			pci_fill_info(dev, PCI_FILL_IDENT);
++			*vendor_id = dev->vendor_id;
++			*device_id = dev->device_id;
++			pci_lookup_name(pacc, pci_name, len, PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE,
++					dev->vendor_id, dev->device_id);
++			break;
++		}
++	}
++
++free:
++	pci_cleanup(pacc);
++}
++
+ int ras_aer_event_handler(struct trace_seq *s,
+ 			  struct tep_record *record,
+ 			  struct tep_event *event, void *context)
+@@ -75,7 +115,8 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 	time_t now;
+ 	struct tm *tm;
+ 	struct ras_aer_event ev;
+-	char buf[BUF_LEN];
++	char buf[BUF_LEN] = { 0 };
++	uint16_t vendor_id = 0, device_id = 0;
+ #ifdef HAVE_AMP_NS_DECODE
+ 	char ipmi_add_sel[105];
+ 	uint8_t sel_data[5];
+@@ -108,6 +149,9 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 		return -1;
+ 	trace_seq_printf(s, "%s ", ev.dev_name);
+ 
++	get_pci_dev_name(ev.dev_name, buf, sizeof(buf), &vendor_id, &device_id);
++	trace_seq_printf(s, "(%s - vendor_id: %#x device_id: %#x) ", buf, vendor_id, device_id);
++
+ 	if (tep_get_field_val(s,  event, "status", record, &status_val, 1) < 0)
+ 		return -1;
+ 
+diff --git a/ras-record.h b/ras-record.h
+index eb5b838..ce7d12c 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -43,7 +43,7 @@ struct ras_mc_offline_event {
+ struct ras_aer_event {
+ 	char timestamp[64];
+ 	const char *error_type;
+-	const char *dev_name;
++	char *dev_name;
+ 	uint8_t tlp_header_valid;
+ 	uint32_t *tlp_header;
+ 	const char *msg;
+-- 
+2.43.5
+
diff --git a/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b5a6922e71e6063526fabf05ebbf639d9d7add4f
--- /dev/null
+++ b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch
@@ -0,0 +1,332 @@
+From 921765e3ccd8333c5474000e409dfb0ec80c8f32 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 27 Mar 2025 17:45:16 +0800
+Subject: [PATCH 10/30] rasdaemon: introduce EDPC config in rasdaemon
+
+A system with EDPC-enabled devices can recover from fatal AER errors.
+Rasdaemon now helps users correctly configure EDPC functionality.
+
+Rasdaemon will enable EDPC for fatal errors if PCIE_EDPC_ENABLE is set
+to 1. By default, every device with EDPC capability is enabled; if
+EDPC_DEVICE is specified, only the specified devices are enabled.
+For example:
+	PCIE_EDPC_ENABLE=1
+	EDPC_DEVICE=0000:01:00.0
+only enable device 0000:01:00.0.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |   4 +-
+ misc/rasdaemon.env |  11 +++
+ ras-pcie-edpc.c    | 217 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-pcie-edpc.h    |   9 ++
+ rasdaemon.c        |   5 ++
+ 5 files changed, 244 insertions(+), 2 deletions(-)
+ create mode 100644 ras-pcie-edpc.c
+ create mode 100644 ras-pcie-edpc.h
+
+diff --git a/Makefile.am b/Makefile.am
+index 2911a21..bb3d420 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -56,7 +56,7 @@ if WITH_SQLITE3
+    rasdaemon_SOURCES += ras-record.c
+ endif
+ if WITH_AER
+-   rasdaemon_SOURCES += ras-aer-handler.c
++   rasdaemon_SOURCES += ras-aer-handler.c  ras-pcie-edpc.c
+ endif
+ if WITH_NON_STANDARD
+    rasdaemon_SOURCES += ras-non-standard-handler.c
+@@ -133,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+ 		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \
+-		  ras-poison-page-stat.h ras-erst.h
++		  ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 193ee19..0516c9c 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -101,3 +101,14 @@ MC_CE_STAT_THRESHOLD=2000
+ POISON_STAT_THRESHOLD=102400
+ 
+ ERST_DELETE=1
++
++# EDPC config
++#
++# rasdaemon will enable EDPC for fatal error if PCIE_EDPC_ENABLE set to 1
++# All device with EDPC capability will be enabled by default,
++# if EDPC_DEVICE is specified, only the specified device will be enabled
++# For example:
++#   PCIE_EDPC_ENABLE=1
++#   EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0
++PCIE_EDPC_ENABLE=0
++EDPC_DEVICE=
+diff --git a/ras-pcie-edpc.c b/ras-pcie-edpc.c
+new file mode 100644
+index 0000000..4731b05
+--- /dev/null
++++ b/ras-pcie-edpc.c
+@@ -0,0 +1,217 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include <pci/pci.h>
++#include <linux/pci_regs.h>
++#include <stdbool.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++
++#include "ras-pcie-edpc.h"
++#include "ras-logger.h"
++#include "types.h"
++
++#define EDPC_DEVICE "EDPC_DEVICE"
++
++#define PCI_EXP_DPC_CTL_EN_MASK	0x3
++
++static char *edpc_str[] = {	/* indexed by the 2-bit DPC trigger-enable field */
++	[0] = "Disabled", [PCI_EXP_DPC_CTL_EN_FATAL] = "Fatal Error",
++	[PCI_EXP_DPC_CTL_EN_NONFATAL] = "Non-Fatal Error", [3] = "Reserved",
++};
++
++static bool is_cxl_mem_or_cache(struct pci_dev *dev)
++{
++	struct pci_cap *cap;
++	u32 hdr;
++	u16 vendor, cxl_cap, id;
++
++	cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DVSEC, PCI_CAP_EXTENDED);
++	if (!cap)
++		return false;
++
++	hdr = pci_read_long(dev, cap->addr + PCI_DVSEC_HEADER1);
++	vendor = hdr & GENMASK(15, 0);
++	id = pci_read_word(dev, cap->addr + PCI_DVSEC_HEADER2);
++	if (vendor != PCI_DVSEC_VENDOR_ID_CXL || id != PCI_DVSEC_ID_CXL)
++		return false;
++
++	cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_CAP);
++	if (cxl_cap & (PCI_CXL_CAP_CACHE | PCI_CXL_CAP_MEM))
++		return true;
++
++	return false;
++}
++
++/**
++ * CXL 2.0 RAS spec: 4.2:
++ * Enabling eDPC is not recommended in most CXL 2.0 systems because eDPC
++ * containment flow brings the link down, disrupting CXL.cache and
++ * CXL.mem traffic which can lead to host timeouts.
++ */
++static void cxl_check_rp(struct pci_dev *dev, struct pci_dev *dpc)
++{
++	struct pci_dev *dev_p, *dpc_p;
++	for (dev_p = dev->parent; dev_p; dev_p = dev_p->parent) {
++		for (dpc_p = dpc->next; dpc_p; dpc_p = dpc_p->next) {
++			if (dev_p->domain == dpc_p->domain &&
++			    dev_p->bus == dpc_p->bus &&
++			    dev_p->dev == dpc_p->dev &&
++			    dev_p->func == dpc_p->func) {
++				dpc_p->aux = (void *)true;
++				log(TERM, LOG_INFO, "Device %x:%x:%x.%x is CXL RP, ignore EDPC config\n",
++					dpc_p->domain, dpc_p->bus, dpc_p->dev, dpc_p->func);
++			    }
++		}
++	}
++}
++
++static bool has_edpc(struct pci_dev *dev)
++{
++	struct pci_cap *cap;
++
++	pci_fill_info(dev, PCI_FILL_EXT_CAPS);
++	cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED);
++	if (!cap)
++		return false;
++	return true;
++}
++
++static void set_edpc(struct pci_dev *dev)
++{
++	struct pci_cap *cap;
++	u16 control;
++	int need_config = 0;
++
++	cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED);
++	if (!cap)
++		return;
++
++	control = pci_read_word(dev, cap->addr + PCI_EXP_DPC_CTL);
++	need_config = PCI_DPC_CTL_TRIGGER(control) == PCI_EXP_DPC_CTL_EN_FATAL ? 0 : 1;
++	log(TERM, LOG_INFO, "Device %x:%x:%x.%x origin EDPC %s and triggered for %s, %s need config\n",
++	    dev->domain, dev->bus, dev->dev, dev->func,
++	    (control & PCI_EXP_DPC_CTL_INT_EN) ? "enabled" : "disabled",
++	    edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK],
++	    need_config ? "" : "not");
++
++	if (need_config) {
++		control &= ~PCI_EXP_DPC_CTL_EN_MASK;	/* clear only the trigger field, keep the other bits */
++		control |= PCI_EXP_DPC_CTL_EN_FATAL;
++		pci_write_word(dev, cap->addr + PCI_EXP_DPC_CTL, control);
++		log(TERM, LOG_INFO, "Device %x:%x:%x.%x EDPC %s and triggered for %s\n",
++		    dev->domain, dev->bus, dev->dev, dev->func,
++		    (control & PCI_EXP_DPC_CTL_INT_EN) ? "enabled" : "disabled",
++		    edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK]);
++	}
++}
++
++static struct pci_filter *config_pcie_edpc_device(struct pci_access *pacc, char *names, int *len)
++{
++	int i;
++	struct pci_filter *filter = NULL;
++	char *token, *err, pci_names[MAX_PATH + 1];
++
++	strscpy(pci_names, names, sizeof(pci_names));
++	for (i = 0; pci_names[i] != '\0'; i++)
++		if (pci_names[i] == ',')
++			(*len)++;
++
++	filter = calloc(*len, sizeof(struct pci_filter));
++	if (!filter)
++		return NULL;
++
++	i = 0;
++	token = strtok(pci_names, ",");
++	while (token) {
++		pci_filter_init(pacc, &filter[i]);
++		err = pci_filter_parse_slot(&filter[i++], token);
++		if (err) {
++			free(filter);
++			log(TERM, LOG_ERR, "Invalid PCI device name %s\n", err);
++			return NULL;
++		}
++		token = strtok(NULL, ",");
++	}
++
++	log(TERM, LOG_INFO, "Config PCIE EDPC for: %s\n", names);	/* progress note, not an error */
++
++	return filter;
++}
++
++int config_pcie_edpc(void)
++{
++	struct pci_access *pacc;
++	struct pci_dev *dev, *dev_head, *tmp;
++	int ret = 0, len = 1, i;
++	char *pci_names;
++	struct pci_filter *filter = NULL;
++	struct pci_dev dev_dpc_head = { 0 };
++
++	pacc = pci_alloc();
++	if (!pacc)
++		return -1;
++
++	pci_init(pacc);
++	pci_scan_bus(pacc);
++
++	pci_names = getenv(EDPC_DEVICE);
++	if (pci_names && strlen(pci_names) != 0) {
++		filter = config_pcie_edpc_device(pacc, pci_names, &len);
++		if (!filter)
++			goto free;
++	} else {
++		len = 0;
++	}
++
++	dev_head = pacc->devices;
++	for (dev = dev_head; dev; dev = dev->next) {
++		pci_fill_info(dev, PCI_FILL_PARENT);
++		if (has_edpc(dev)) {
++			tmp = malloc(sizeof(struct pci_dev));
++			if (!tmp) {
++				ret = -1;
++				goto free;
++			}
++
++			memcpy(tmp, dev, sizeof(struct pci_dev));
++			tmp->next = dev_dpc_head.next;
++			dev_dpc_head.next = tmp;
++		}
++	}
++
++	for (dev = dev_head; dev; dev = dev->next)
++		if (is_cxl_mem_or_cache(dev))
++			cxl_check_rp(dev, &dev_dpc_head);
++
++	for (dev = dev_dpc_head.next; dev; dev = dev->next) {
++		if (!dev->aux) {
++			if (len) {
++				for (i = 0; i < len; i++) {
++					if (pci_filter_match(&filter[i], dev)) {
++						set_edpc(dev);
++						break;
++					}
++				}
++			} else {
++				set_edpc(dev);
++			}
++		}
++	}
++
++free:
++	while (dev_dpc_head.next) {
++		tmp = dev_dpc_head.next;
++		dev_dpc_head.next = tmp->next;
++		free(tmp);
++	}
++
++	pci_cleanup(pacc);
++	free(filter);
++	return ret;
++}
+diff --git a/ras-pcie-edpc.h b/ras-pcie-edpc.h
+new file mode 100644
+index 0000000..a7b96a4
+--- /dev/null
++++ b/ras-pcie-edpc.h
+@@ -0,0 +1,9 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++ #define PCIE_EDPC_ENABLE "PCIE_EDPC_ENABLE"
++
++int config_pcie_edpc(void);
+diff --git a/rasdaemon.c b/rasdaemon.c
+index be5c390..3d4c2ec 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -16,6 +16,7 @@
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
+ #include "ras-mc-handler.h"
++#include "ras-pcie-edpc.h"
+ #include "types.h"
+ 
+ /*
+@@ -235,6 +236,10 @@ int main(int argc, char *argv[])
+ 		handle_erst();
+ #endif
+ #endif
++	if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE)))
++		config_pcie_edpc();
++	else
++		log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n");
+ 
+ 	handle_ras_events(args.record_events, args.enable_ipmitool);
+ 
+-- 
+2.43.5
+
diff --git a/1011-rasdaemon-support-nvgpu-event.patch b/1011-rasdaemon-support-nvgpu-event.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b2ca511c696647fac28185033b4d7b564edf78f0
--- /dev/null
+++ b/1011-rasdaemon-support-nvgpu-event.patch
@@ -0,0 +1,511 @@
+From 0696914f490288081325b2a4425de1f0d45c4554 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 11 Apr 2025 13:30:10 +0800
+Subject: [PATCH 11/30] rasdaemon: support nvgpu event
+
+Use the NVML library to report nvgpu events. The new environment variable
+NVGPU_DISABLE_EVENT selects the event types that are not registered.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |  13 +++-
+ configure.ac       |  11 +++
+ contrib/nvml.py    |  77 ++++++++++++++++++++
+ misc/rasdaemon.env |   7 ++
+ ras-nvgpu-nvml.c   | 178 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-nvgpu.c        |  54 ++++++++++++++
+ ras-nvgpu.h        |  14 ++++
+ rasdaemon.c        |  27 +++++++
+ 9 files changed, 380 insertions(+), 2 deletions(-)
+ create mode 100644 contrib/nvml.py
+ create mode 100644 ras-nvgpu-nvml.c
+ create mode 100644 ras-nvgpu.c
+ create mode 100644 ras-nvgpu.h
+
+diff --git a/Makefile.am b/Makefile.am
+index bb3d420..58ac082 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -17,10 +17,12 @@ EXTRA_DIST = \
+ 	$(RSYSLOG_SERVICES_IN) \
+ 	$(LOGROTATE_SERVICES_IN) \
+ 	misc/rasdaemon.env \
++	contrib/nvml.py \
+ 	contrib/mc_event_trigger \
+ 	contrib/mem_fail_trigger
+ 
+ CLEANFILES= \
++	ras-nvgpu-nvml.h \
+ 	misc/ras-mc-ctl.service	\
+ 	misc/rasdaemon.service \
+ 	misc/rasdaemon.syslog-ng \
+@@ -123,7 +125,14 @@ if WITH_ERST
+    rasdaemon_SOURCES += ras-erst.c
+ endif
+ 
+-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS)
++if WITH_NVGPU
++   BUILT_SOURCES = ras-nvgpu-nvml.h
++ras-nvgpu-nvml.h: contrib/nvml.py
++	python3 $< > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
++   rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c
++endif
++
++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl
+ rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS)
+ 
+ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+@@ -133,7 +142,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+ 		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \
+-		  ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h
++		  ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/configure.ac b/configure.ac
+index 3603c7f..43d845d 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -278,6 +278,16 @@ AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [
+ AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"])
+ 
++AC_ARG_ENABLE([nvgpu],
++    AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events]))
++
++AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect")
++  AC_SUBST([WITH_NVGPU])
++])
++AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes])
++AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"])
++
+ test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
+ 
+ CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
+@@ -326,4 +336,5 @@ compile time options summary
+     JAGUAR RAS errors   : $USE_JAGUAR_NS_DECODE
+     Signal              : $USE_SIGNAL
+     ERST                : $USE_ERST
++    NVGPU RAS errors    : $USE_NVGPU
+ EOF
+diff --git a/contrib/nvml.py b/contrib/nvml.py
+new file mode 100644
+index 0000000..9f2c57d
+--- /dev/null
++++ b/contrib/nvml.py
+@@ -0,0 +1,77 @@
++import re
++
++PATH="/usr/local/cuda/include/nvml.h"
++func = ["nvmlInit",
++        "nvmlDeviceGetSupportedEventTypes",
++        "nvmlDeviceRegisterEvents",
++        "nvmlEventSetCreate",
++        "nvmlEventSetWait",
++        "nvmlDeviceGetCount",
++        "nvmlDeviceGetHandleByIndex",
++        "nvmlDeviceGetPciInfo",
++        "nvmlEventSetFree",
++        "nvmlShutdown"]
++
++pattern = re.compile(
++        r'^nvmlReturn_t DECLDIR\s+({})(\(.*?\));'.format('|'.join(map(re.escape, func))),
++        flags=re.MULTILINE
++)
++
++type_pattern = re.compile(
++        r'^#define\s+nvmlEventType(\w+)\s+0x.*',
++        flags=re.MULTILINE
++)
++
++with open(PATH, 'r') as file:
++        content = file.read()
++        matched_lines = pattern.findall(content)
++        type_lines = type_pattern.findall(content)
++
++func_declares = []
++func_defs = []
++func_inits = []
++type_strs = []
++
++for match in matched_lines:
++        func_declares.append('typedef nvmlReturn_t (*my_{}_p){};'.format(match[0], match[1]))
++        func_defs.append('my_{}_p my_{};'.format(match[0], match[0]))
++        func_inits.append('my_{0} = (my_{0}_p)dlsym(handle, "{0}"); \
++                                \n\tif (!my_{0}) {{ \
++                                \n\t\tprintf(\"Failed to load {0}: %s\\n\", dlerror()); \
++                                \n\t\treturn -1; \
++                                \n\t}}'.format(match[0]))
++
++for type_line in type_lines:
++        type_strs.append('case nvmlEventType{}: return \"{}\";'.format(type_line, type_line))
++
++print('''
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++'''
++)
++print('#include <dlfcn.h>\
++        \n#include <stdio.h>\
++        \n#include "/usr/local/cuda/include/nvml.h"')
++print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);')
++print('\n'.join(func_declares))
++print('\nmy_nvmlErrorString_p my_nvmlErrorString;')
++print('\n'.join(func_defs))
++print('\nstatic int my_nvml_setup(void* handle) \n{{\n\t{}{}\n\treturn 0;\n}}'.format('\n\t'.join(func_inits),
++        '\n\tmy_nvmlErrorString = (my_nvmlErrorString_p)dlsym(handle, "nvmlErrorString"); \
++         \n\tif (!my_nvmlErrorString) { \
++         \n\t\tprintf(\"Failed to load nvmlErrorString: %s\\n\", dlerror()); \
++         \n\t\treturn -1; \
++         \n\t}'))
++print('\nstatic const char* my_nvmlEventTypeString(unsigned long long type) \n{{ \
++        \n\n\tswitch (type) {{ \
++        \n\t{} \
++        \n\tdefault: return \"Unknown\"; \
++        \n\t}} \
++        \n\treturn \"Unknown\"; \
++        \n}}'.format('\n\t'.join(type_strs)))
++
++
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 0516c9c..60544f7 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -112,3 +112,10 @@ ERST_DELETE=1
+ #   EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0
+ PCIE_EDPC_ENABLE=0
+ EDPC_DEVICE=
++
++# Registered event type for nvgpu, default is
++# nvmlEventTypeAll & ~nvmlEventTypeClock
++# ref: https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html
++# For example:
++#   NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock
++NVGPU_DISABLE_EVENT="0x10"
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c
+new file mode 100644
+index 0000000..aabe8f9
+--- /dev/null
++++ b/ras-nvgpu-nvml.c
+@@ -0,0 +1,178 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include <time.h>
++#include <unistd.h>
++#include <stdlib.h>
++
++#include "ras-logger.h"
++#include "ras-nvgpu-nvml.h"
++#include "ras-nvgpu.h"
++#include "trace-seq.h"
++#include "types.h"
++
++#define XID_EVENT_NAME "xid"
++
++const char *lib_name[] = {
++	"/lib64/libnvidia-ml.so",
++	"/lib64/libnvidia-ml.so.1",
++	"/usr/local/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so",
++	"/usr/local/cuda/targets/sbsa-linux/lib/stubs/libnvidia-ml.so"
++};
++
++static void *find_lib(void)
++{
++	void *handle = NULL;
++
++	for (int i = 0; i < ARRAY_SIZE(lib_name); i++) {
++		handle = dlopen(lib_name[i], RTLD_LAZY);
++		if (handle)
++			return handle;
++	}
++
++	log(ALL, LOG_ERR, "Failed to load libnvidia-ml\n");
++	return NULL;
++}
++
++static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
++{
++	struct trace_seq s;
++	nvmlPciInfo_t pci = { 0 };	/* stays zeroed if the PCI info query fails */
++	time_t now;
++	struct tm *tm;
++	char timestamp[64] = { 0 };	/* stays empty if localtime() fails */
++
++	time(&now);
++	tm = localtime(&now);
++
++	if (tm)
++		strftime(timestamp, sizeof(timestamp),
++			 "%Y-%m-%d %H:%M:%S %z", tm);
++
++	my_nvmlDeviceGetPciInfo(data->device, &pci);
++
++	trace_seq_init(&s);
++	if (data->eventType == nvmlEventTypeXidCriticalError) {
++		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
++			"<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME);
++		trace_seq_printf(&s, "%s ", timestamp);
++		trace_seq_printf(&s, "xid: %lld ", data->eventData);
++	} else {
++		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
++			"<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME);
++		trace_seq_printf(&s, "%s ", timestamp);
++		trace_seq_printf(&s, "event_type: %s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType);
++		trace_seq_printf(&s, "data: %lld ", data->eventData);
++	}
++
++	trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci));
++	trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId);
++	trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId);
++
++	trace_seq_terminate(&s);
++	trace_seq_do_printf(&s);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&s);
++
++	return 0;
++}
++
++int ras_nvgpu_nvml_handle(void)
++{
++	void *nvml_handle;
++	nvmlReturn_t ret;
++	unsigned int device_count;
++	nvmlDevice_t *devices;
++	nvmlEventSet_t event_set;
++	char *event_types_str = NULL;
++	unsigned long long disable = 0, event_types = 0;
++	nvmlEventData_t event_data;
++
++	nvml_handle = find_lib();
++	if (!nvml_handle) {
++		log(ALL, LOG_ERR, "Failed to load libnvidia-ml: %s\n", dlerror());
++		return 1;
++	}
++
++	if (my_nvml_setup(nvml_handle)) {
++		log(ALL, LOG_ERR, "Failed to setup libnvidia-ml\n");
++		dlclose(nvml_handle);
++		return 1;
++	}
++
++	ret = my_nvmlInit();
++	if (ret) {
++		log(ALL, LOG_ERR, "NVML Init failed: %s\n", my_nvmlErrorString(ret));
++		goto free_dl;
++	}
++
++	ret = my_nvmlDeviceGetCount(&device_count);
++	if (ret) {
++		log(ALL, LOG_ERR, "Get device count failed: %s\n", my_nvmlErrorString(ret));
++		goto free_nvml;
++	}
++
++	devices = malloc(device_count * sizeof(nvmlDevice_t));
++	if (!devices) {
++		log(ALL, LOG_ERR, "Failed to allocate memory for devices\n");
++		ret = NVML_ERROR_MEMORY;	/* report the failure instead of returning 0 */
++		goto free_nvml;
++	}
++	for (unsigned int i = 0; i < device_count; i++) {
++		ret = my_nvmlDeviceGetHandleByIndex(i, &devices[i]);
++		if (ret) {
++			log(ALL, LOG_ERR, "Get device handle failed: %s\n", my_nvmlErrorString(ret));
++			goto free_dev;
++		}
++	}
++
++	ret = my_nvmlEventSetCreate(&event_set);
++	if (ret) {
++		log(ALL, LOG_ERR, "Create event set failed: %s\n", my_nvmlErrorString(ret));
++		goto free_dev;
++	}
++
++	event_types_str = getenv("NVGPU_DISABLE_EVENT");
++	if (event_types_str) {
++		disable = strtoull(event_types_str, NULL, 0);
++		log(ALL, LOG_INFO, "Disable NVGPU events %s\n", my_nvmlEventTypeString(disable));
++	}
++
++	for (unsigned int i = 0; i < device_count; i++) {
++		ret = my_nvmlDeviceGetSupportedEventTypes(devices[i], &event_types);
++		if (ret) {
++			log(ALL, LOG_ERR, "Get support events failed: %s\n", my_nvmlErrorString(ret));
++			goto free_event;
++		}
++
++		ret = my_nvmlDeviceRegisterEvents(devices[i], event_types & ~disable, event_set);
++		if (ret) {
++			log(ALL, LOG_ERR, "Register events failed: %s\n", my_nvmlErrorString(ret));
++			goto free_event;
++		}
++	}
++
++	while (1) {
++		ret = my_nvmlEventSetWait(event_set, &event_data, -1);
++		if (!ret) {
++			report_ras_gpu_nvml(&event_data, devices);
++		} else if (ret != NVML_ERROR_TIMEOUT) {	/* a timeout just means no event yet */
++			log(ALL, LOG_ERR, "Wait for event failed: %s\n", my_nvmlErrorString(ret));
++			break;
++		}
++	}
++
++free_event:
++	my_nvmlEventSetFree(event_set);
++free_dev:
++	free(devices);
++free_nvml:
++	my_nvmlShutdown();
++free_dl:
++	dlclose(nvml_handle);
++	return ret;
++}
+diff --git a/ras-nvgpu.c b/ras-nvgpu.c
+new file mode 100644
+index 0000000..5c63279
+--- /dev/null
++++ b/ras-nvgpu.c
+@@ -0,0 +1,54 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include <errno.h>
++#include <pthread.h>
++#include <stdio.h>
++#include <signal.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <unistd.h>
++
++#include "ras-events.h"
++#include "ras-logger.h"
++#include "ras-nvgpu.h"
++void *ras_nvgpu_handle(void *arg)
++{
++	(void)arg;
++	sigset_t set;
++	struct stat st;
++	int retry = 3;
++
++	if (stat("/dev/nvidia0", &st) == -1) {
++		log(ALL, LOG_WARNING, "NVIDIA device not found: %s\n", strerror(errno));
++		return NULL;
++	}
++	if (!S_ISCHR(st.st_mode)) {
++		log(ALL, LOG_WARNING, "NVIDIA device is not a character device\n");
++		return NULL;
++	}
++
++	sigemptyset(&set);
++	sigaddset(&set, SIGINT);
++	sigaddset(&set, SIGTERM);
++	sigaddset(&set, SIGHUP);
++	sigaddset(&set, SIGQUIT);
++	if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) {
++		log(ALL, LOG_ERR, "Failed to set thread signal mask\n");
++		return NULL;
++	}
++
++	while (retry--) {
++		if (ras_nvgpu_nvml_handle()) {
++			log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry);
++			sleep(10);
++		}
++	}
++
++	log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n");
++
++	return NULL;
++}
+diff --git a/ras-nvgpu.h b/ras-nvgpu.h
+new file mode 100644
+index 0000000..32827ad
+--- /dev/null
++++ b/ras-nvgpu.h
+@@ -0,0 +1,14 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#ifndef __RAS_NVGPU_H
++#define __RAS_NVGPU_H
++
++#define NVGPU_EVENT_NAME "nvgpu"
++
++void *ras_nvgpu_handle(void *arg);
++int ras_nvgpu_nvml_handle(void);
++#endif
+diff --git a/rasdaemon.c b/rasdaemon.c
+index 3d4c2ec..9c5f9dd 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -5,6 +5,7 @@
+  */
+ 
+ #include <argp.h>
++#include <pthread.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+@@ -17,6 +18,7 @@
+ #include "ras-record.h"
+ #include "ras-mc-handler.h"
+ #include "ras-pcie-edpc.h"
++#include "ras-nvgpu.h"
+ #include "types.h"
+ 
+ /*
+@@ -241,7 +243,32 @@ int main(int argc, char *argv[])
+ 	else
+ 		log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n");
+ 
++#ifdef HAVE_NVGPU
++	pthread_t nvgpu_thread = 0, main_thread = pthread_self();
++	bool nvgpu_enable = true;
++
++	if (choices_disable && strlen(choices_disable) != 0 &&
++	    strstr(choices_disable, NVGPU_EVENT_NAME)) {	/* "nvgpu" listed among the disabled events */
++		nvgpu_enable = false;
++		log(ALL, LOG_INFO, "Disable nvgpu event.\n");
++	}
++
++	if (nvgpu_enable) {
++		if (pthread_create(&nvgpu_thread, NULL, ras_nvgpu_handle, &main_thread) != 0) {
++			log(ALL, LOG_ERR, "Failed to create XID thread\n");
++			/* pthread_create() failed: nvgpu_thread names no thread, so there is nothing to cancel. */
++			exit(EXIT_FAILURE);
++		}
++		pthread_detach(nvgpu_thread);	/* never joined: let it release its own resources */
++		log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n");
++	}
++#endif
+ 	handle_ras_events(args.record_events, args.enable_ipmitool);
+ 
++#ifdef HAVE_NVGPU
++	if (nvgpu_enable)
++		pthread_cancel(nvgpu_thread);
++#endif
++
+ 	return 0;
+ }
+-- 
+2.43.5
+
diff --git a/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch
new file mode 100644
index 0000000000000000000000000000000000000000..4292bb075257cf6d4c26d11b711444c40a51635d
--- /dev/null
+++ b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch
@@ -0,0 +1,937 @@
+From 9163f3cd0f9344aacf8eb4b061f3ea2269f6c0cb Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 7 Jun 2024 11:26:06 +0800
+Subject: [PATCH 12/30] rasdaemon: enhance rasdaemon event trigger
+
+- Add trigger timeout to avoid trigger hang.
+- Move all trigger code to trigger.c
+
+Use $(TRIGGER_NAME)_TIMEOUT to set trigger timeout val, for example:
+
+MC_UE_TRIGGER: The script executed when an uncorrected mc_event occurs.
+MC_UE_TRIGGER_TIMEOUT: Timeout (seconds) for MC_UE_TRIGGER, set it to 0
+to disable the timeout.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                  |   6 +-
+ contrib/aer_trigger          |  27 +++
+ contrib/mc_event_trigger     |   9 +
+ contrib/mce_record_trigger   |  46 +++++
+ contrib/mem_fail_trigger     |  21 +-
+ misc/rasdaemon.env           |  23 ++-
+ ras-aer-handler.c            |   3 +
+ ras-events.c                 |  18 --
+ ras-mc-handler.c             |  89 +--------
+ ras-mce-handler.c            |   3 +
+ ras-memory-failure-handler.c |  55 +----
+ trigger.c                    | 376 ++++++++++++++++++++++++++++++++---
+ trigger.h                    |  19 +-
+ 13 files changed, 493 insertions(+), 202 deletions(-)
+ create mode 100755 contrib/aer_trigger
+ create mode 100755 contrib/mce_record_trigger
+
+diff --git a/Makefile.am b/Makefile.am
+index 58ac082..72f30b4 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -18,8 +18,7 @@ EXTRA_DIST = \
+ 	$(LOGROTATE_SERVICES_IN) \
+ 	misc/rasdaemon.env \
+ 	contrib/nvml.py \
+-	contrib/mc_event_trigger \
+-	contrib/mem_fail_trigger
++	contrib/*_trigger
+ 
+ CLEANFILES= \
+ 	ras-nvgpu-nvml.h \
+@@ -171,8 +170,6 @@ install-data-local:
+ 	$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
+ 	$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
+ 	install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
+-	$(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
+-	$(install_sh) @abs_srcdir@/contrib/mem_fail_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger"
+ 	if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \
+ 	fi
+@@ -182,3 +179,4 @@ install-data-local:
+ 	if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \
+ 	fi
++	$(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/"
+diff --git a/contrib/aer_trigger b/contrib/aer_trigger
+new file mode 100755
+index 0000000..87f9da9
+--- /dev/null
++++ b/contrib/aer_trigger
+@@ -0,0 +1,27 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when an
++#  aer_event occurs; the environment variables carry all the information
++#  reported by the tracepoint.
++
++# environment:
++# TIMESTAMP             Timestamp when error occurred
++# ERROR_TYPE            Corrected | Uncorrected (Non-Fatal) | Uncorrected (Fatal)
++# DEV_NAME              BDF
++# TLP_HEADER_VALID      Non-zero when TLP_HEADER is provided
++# TLP_HEADER            Four 32-bit hex words; only set when TLP_HEADER_VALID is non-zero
++# MSG                   Message text of the event
++#
++
++[ -x ./aer_trigger.local ] && . ./aer_trigger.local
++
++if [ -d aer_trigger.extern ]  # optional directory holding additional hooks
++then
++    ls aer_trigger.extern |
++    while IFS= read -r item  # keep backslashes and surrounding blanks of the name
++    do
++        [ -x "./aer_trigger.extern/$item" ] && . "./aer_trigger.extern/$item"
++    done
++fi
++
++exit 0
+diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger
+index 9862595..5c68b56 100755
+--- a/contrib/mc_event_trigger
++++ b/contrib/mc_event_trigger
+@@ -23,4 +23,13 @@
+ 
+ [ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local
+ 
++if [ -d mc_event_trigger.extern ]  # optional directory holding additional hooks
++then
++    ls mc_event_trigger.extern |
++    while IFS= read -r item  # keep backslashes and surrounding blanks of the name
++    do
++        [ -x "./mc_event_trigger.extern/$item" ] && . "./mc_event_trigger.extern/$item"
++    done
++fi
++
+ exit 0
+diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger
+new file mode 100755
+index 0000000..ca49e6d
+--- /dev/null
++++ b/contrib/mce_record_trigger
+@@ -0,0 +1,46 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when an
++#  mce_record event occurs; the environment variables carry all the
++#  information reported by the tracepoint.
++#
++# environment:
++# MCGCAP        MCGCAP MSR: machine check capabilities of CPU
++# MCGSTATUS     Machine Check Global Status MSR
++# STATUS        Bank's MCi_STATUS MSR
++# ADDR          Bank's MCi_ADDR MSR
++# MISC          Bank's MCi_MISC MSR
++# IP            Instruction Pointer when the error happened
++# TSC           CPU time stamp counter
++# WALLTIME      Wall time_t when error was detected
++# CPU           CPU number; obsoleted by extcpu
++# CPUID         CPUID 1 EAX
++# APICID        CPU initial APIC ID
++# SOCKETID      CPU socket ID
++# CS            Code segment
++# BANK          Machine check bank reporting the error
++# CPUVENDOR     Kernel's X86_VENDOR enum
++# SYND          MCA_SYND MSR: only valid on SMCA systems
++# IPID          MCA_IPID MSR: only valid on SMCA systems
++# TIMESTAMP     Rasdaemon timestamp
++# BANK_NAME     Decoded bank name
++# ERROR_MSG     Vendor define error message
++# MCGSTATUS_MSG Decode mcgstatus
++# MCISTATUS_MSG Decode mcistatus
++# MCASTATUS_MSG Decode mcastatus
++# USER_ACTION   Recommendations for actions users should take
++# MC_LOCATION   Error location in MC
++#
++
++[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local
++
++if [ -d mce_record_trigger.extern ]  # optional directory holding additional hooks
++then
++    ls mce_record_trigger.extern |
++    while IFS= read -r item  # keep backslashes and surrounding blanks of the name
++    do
++        [ -x "./mce_record_trigger.extern/$item" ] && . "./mce_record_trigger.extern/$item"
++    done
++fi
++
++exit 0
+diff --git a/contrib/mem_fail_trigger b/contrib/mem_fail_trigger
+index d75ce50..f63df91 100755
+--- a/contrib/mem_fail_trigger
++++ b/contrib/mem_fail_trigger
+@@ -1,14 +1,25 @@
+ #!/bin/sh
+ # SPDX-License-Identifier: GPL-2.0
+-#
+ #  This shell script can be executed by rasdaemon in daemon mode when a
+ #  memory_failure_event is occured, environment variables include all
+ #  information reported by tracepoint.
++
++# environment:
++# TIMESTAMP             Timestamp when error occurred
++# PFN                   Offlined page PFN
++# PAGE_TYPE             Page type
++# ACTION_RESULT         Action result
+ #
+ 
+-echo TIMESTAMP: $TIMESTAMP
+-echo PFN: $PFN
+-echo PAGE_TYPE: $PAGE_TYPE
+-echo ACTION_RESULT: $ACTION_RESULT
++[ -x ./mf_trigger.local ] && . ./mf_trigger.local
++
++if [ -d mf_trigger.extern ]  # optional directory holding additional hooks
++then
++    ls mf_trigger.extern |
++    while IFS= read -r item  # keep backslashes and surrounding blanks of the name
++    do
++        [ -x "./mf_trigger.extern/$item" ] && . "./mf_trigger.extern/$item"
++    done
++fi
+ 
+ exit 0
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 60544f7..1f5da55 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -83,11 +83,30 @@ TRIGGER_DIR=
+ 
+ # Execute these triggers when the mc_event occured, the triggers will not
+ # be executed if the trigger is not specified.
++# Each trigger can be given a timeout in seconds; a trigger that runs longer
++# is killed. An unset timeout defaults to 1; set it to 0 to disable the timeout.
+ # For example:
+-#   MC_CE_TRIGGER=mc_event_trigger
+ #   MC_UE_TRIGGER=mc_event_trigger
+-MC_CE_TRIGGER=
++#   MC_UE_TRIGGER_TIMEOUT=1
++
++# trigger for mc_event
+ MC_UE_TRIGGER=
++MC_UE_TRIGGER_TIMEOUT=0
++
++MCE_DE_TRIGGER=
++MCE_UE_TRIGGER=
++MCE_DE_TRIGGER_TIMEOUT=0
++MCE_UE_TRIGGER_TIMEOUT=0
++
++MEM_FAIL_TRIGGER=
++MEM_FAIL_TRIGGER_TIMEOUT=0
++
++AER_CE_TRIGGER=
++AER_UE_TRIGGER=
++AER_FATAL_TRIGGER=
++AER_CE_TRIGGER_TIMEOUT=0
++AER_UE_TRIGGER_TIMEOUT=0
++AER_FATAL_TRIGGER_TIMEOUT=0
+ 
+ # CE Statistic Threshold
+ #
+diff --git a/ras-aer-handler.c b/ras-aer-handler.c
+index 53acbc8..471ad9f 100644
+--- a/ras-aer-handler.c
++++ b/ras-aer-handler.c
+@@ -17,6 +17,7 @@
+ #include "ras-report.h"
+ #include "unified-sel.h"
+ #include "types.h"
++#include "trigger.h"
+ 
+ /* bit field meaning for correctable error */
+ static const char *aer_cor_errors[32] = {
+@@ -254,5 +255,7 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 			return -1;
+ #endif
+ 
++	run_aer_event_trigger(&ev);
++
+ 	return 0;
+ }
+diff --git a/ras-events.c b/ras-events.c
+index d42ed9f..06f9a37 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -54,13 +54,6 @@
+ 
+ char *choices_disable;
+ 
+-static const struct event_trigger event_triggers[] = {
+-	{ "mc_event", &mc_event_trigger_setup },
+-#ifdef HAVE_MEMORY_FAILURE
+-	{ "memory_failure_event", &mem_fail_event_trigger_setup },
+-#endif
+-};
+-
+ static int get_debugfs_dir(char *tracing_dir, size_t len)
+ {
+ 	FILE *fp;
+@@ -328,17 +321,6 @@ free_ras:
+ 	return 0;
+ }
+ 
+-static void setup_event_trigger(char *event)
+-{
+-	struct event_trigger trigger;
+-
+-	for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) {
+-		trigger = event_triggers[i];
+-		if (!strcmp(event, trigger.name))
+-			trigger.setup();
+-	}
+-}
+-
+ #ifdef HAVE_DISKERROR
+ #if (!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL)
+ /*
+diff --git a/ras-mc-handler.c b/ras-mc-handler.c
+index 7a18f73..a729d93 100644
+--- a/ras-mc-handler.c
++++ b/ras-mc-handler.c
+@@ -20,89 +20,6 @@
+ #include "trigger.h"
+ #include "types.h"
+ 
+-#define MAX_ENV 30
+-static const char *mc_ce_trigger = NULL;
+-static const char *mc_ue_trigger = NULL;
+-
+-void mc_event_trigger_setup(void)
+-{
+-	const char *trigger;
+-
+-	trigger = getenv("MC_CE_TRIGGER");
+-	if (trigger && strcmp(trigger, "")) {
+-		mc_ce_trigger = trigger_check(trigger);
+-
+-		if (!mc_ce_trigger) {
+-			log(ALL, LOG_ERR,
+-			    "Cannot access mc_event ce trigger `%s`\n",
+-			    trigger);
+-		} else {
+-			log(ALL, LOG_INFO,
+-			    "Setup mc_event ce trigger `%s`\n",
+-			    trigger);
+-		}
+-	}
+-
+-	trigger = getenv("MC_UE_TRIGGER");
+-	if (trigger && strcmp(trigger, "")) {
+-		mc_ue_trigger = trigger_check(trigger);
+-
+-		if (!mc_ue_trigger) {
+-			log(ALL, LOG_ERR,
+-			    "Cannot access mc_event ue trigger `%s`\n",
+-			    trigger);
+-		} else {
+-			log(ALL, LOG_INFO,
+-			    "Setup mc_event ue trigger `%s`\n",
+-			    trigger);
+-		}
+-	}
+-}
+-
+-static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
+-{
+-	char *env[MAX_ENV];
+-	int ei = 0;
+-	int i;
+-
+-	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
+-		goto free;
+-	env[ei] = NULL;
+-	assert(ei < MAX_ENV);
+-
+-	run_trigger(mc_trigger, NULL, env, "mc_event");
+-
+-free:
+-	for (i = 0; i < ei; i++)
+-		free(env[i]);
+-}
+-
+ static unsigned long long per_sec_ce_count;
+ unsigned long long mc_ce_stat_threshold;
+ static time_t cur;
+@@ -312,11 +229,7 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 	ras_report_mc_event(ras, &ev);
+ #endif
+ 
+-	if (mc_ce_trigger && !strcmp(ev.error_type, "Corrected"))
+-		run_mc_trigger(&ev, mc_ce_trigger);
+-
+-	if (mc_ue_trigger && !strcmp(ev.error_type, "Uncorrected"))
+-		run_mc_trigger(&ev, mc_ue_trigger);
++	run_mc_event_trigger(&ev);
+ 
+ 	return 0;
+ 
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index 3d8d97d..92c5339 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -17,6 +17,7 @@
+ #include "ras-mce-handler.h"
+ #include "ras-report.h"
+ #include "types.h"
++#include "trigger.h"
+ 
+ /*
+  * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code,
+@@ -598,5 +599,7 @@ int ras_mce_event_handler(struct trace_seq *s,
+ 	ras_report_mce_event(ras, &e);
+ #endif
+ 
++	run_mce_record_trigger(&e);
++
+ 	return 0;
+ }
+diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
+index d4c293b..0f4e937 100644
+--- a/ras-memory-failure-handler.c
++++ b/ras-memory-failure-handler.c
+@@ -87,59 +87,6 @@ static const struct {
+ 	{ MF_RECOVERED, "Recovered" },
+ };
+ 
+-#define MAX_ENV 6
+-static const char *mf_trigger = NULL;
+-
+-void mem_fail_event_trigger_setup(void)
+-{
+-	const char *trigger;
+-
+-	trigger = getenv("MEM_FAIL_TRIGGER");
+-	if (trigger && strcmp(trigger, "")) {
+-		mf_trigger = trigger_check(trigger);
+-
+-		if (!mf_trigger) {
+-			log(ALL, LOG_ERR,
+-			    "Cannot access memory_fail_event trigger `%s`\n",
+-			    trigger);
+-		} else {
+-			log(ALL, LOG_INFO,
+-			    "Setup memory_fail_event trigger `%s`\n",
+-			    trigger);
+-		}
+-	}
+-}
+-
+-static void run_mf_trigger(struct ras_mf_event *ev)
+-{
+-	char *env[MAX_ENV];
+-	int ei = 0;
+-	int i;
+-
+-	if (!mf_trigger)
+-		return;
+-
+-	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "PFN=%s", ev->pfn) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "PAGE_TYPE=%s", ev->page_type) < 0)
+-		goto free;
+-	if (asprintf(&env[ei++], "ACTION_RESULT=%s", ev->action_result) < 0)
+-		goto free;
+-
+-	env[ei] = NULL;
+-	assert(ei < MAX_ENV);
+-
+-	run_trigger(mf_trigger, NULL, env, "memory_fail_event");
+-
+-free:
+-	for (i = 0; i < ei; i++)
+-		free(env[i]);
+-}
+-
+ static const char *get_page_type(int page_type)
+ {
+ 	unsigned int i;
+@@ -222,7 +169,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
+ 	/* Report event to ABRT */
+ 	ras_report_mf_event(ras, &ev);
+ #endif
+-	run_mf_trigger(&ev);
++	run_mf_event_trigger(&ev);
+ 
+ 	return 0;
+ }
+diff --git a/trigger.c b/trigger.c
+index aa19a22..a13fffd 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -3,56 +3,378 @@
+ #define _GNU_SOURCE
+ #include <stdio.h>
+ #include <stdlib.h>
++#include <string.h>
+ #include <sys/wait.h>
+ #include <unistd.h>
+ 
+ #include "ras-logger.h"
++#include "types.h"
+ #include "trigger.h"
+ 
+-void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter)
++#include "ras-mce-handler.h"
++
++#define MAX_ENV 30
++static int child_done, alarm_done;
++static char *trigger_dir;
++
++static void child_handler(int sig)
++{
++	child_done = 1;
++}
++
++static void alarm_handler(int sig)
++{
++	alarm_done = 1;
++}
++
++/* Run trigger @t with environment @env and wait for it, up to t->timeout seconds. */
++void run_trigger(struct event_trigger *t, char *argv[], char **env)
+ {
+ 	pid_t child;
+-	int status;
++	const char *trigger = t->path, *path = t->abs_path;
++	int status, timeout = t->timeout;
+ 
+-	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);
++	log(ALL, LOG_INFO, "Running trigger `%s' (reporter: %s)\n",
++	    trigger, t->event_name);
+ 
+ 	child = fork();
+ 	if (child < 0) {
+-		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
++		log(ALL, LOG_ERR, "Cannot create process for trigger\n");
+ 		return;
++	} else if (child == 0) {
++		execve(path, argv, env);	/* only returns on failure */
++		log(ALL, LOG_ERR, "Trigger %s exec fail: %s\n", path, strerror(errno));
++		_exit(EXIT_FAILURE);
++	}
++
++	child_done = alarm_done = 0;	/* static flags: forget what the previous run left set */
++	signal(SIGCHLD, child_handler);
++	if (timeout) {
++		signal(SIGALRM, alarm_handler);
++		alarm(timeout);
+ 	}
++	while (!child_done && !alarm_done)	/* an unrelated signal must not end the wait */
++		pause();
+ 
+-	if (child == 0) {
+-		execve(trigger, argv, env);
+-		_exit(127);
+-	} else {
+-		waitpid(child, &status, 0);
+-		if (WIFEXITED(status) && WEXITSTATUS(status)) {
+-			log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
+-			    trigger, WEXITSTATUS(status));
+-		} else if (WIFSIGNALED(status)) {
+-			log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
+-			    trigger, WTERMSIG(status));
++	if (child_done) {
++		if (waitpid(child, &status, WNOHANG) == child) {
++			if (WIFEXITED(status) && WEXITSTATUS(status))
++				log(ALL, LOG_INFO, "Trigger %s exited with status %d\n",
++				    trigger, WEXITSTATUS(status));
++			else if (WIFSIGNALED(status))
++				log(ALL, LOG_INFO, "Trigger %s killed by signal %d\n",
++				    trigger, WTERMSIG(status));
+ 		}
++		alarm(0);
++	} else if (alarm_done) {
++		log(ALL, LOG_WARNING, "Trigger timeout, kill it\n");
++		kill(child, SIGKILL);
++		waitpid(child, &status, 0);	/* reap it so that no zombie is left behind */
+ 	}
++	signal(SIGCHLD, SIG_DFL);
++	signal(SIGALRM, SIG_DFL);
+ }
+ 
+-const char *trigger_check(const char *s)
++int trigger_check(struct event_trigger *t)
+ {
+-	char *name;
+-	int rc;
+-	char *trigger_dir = getenv("TRIGGER_DIR");
++	/* Build t->abs_path; without TRIGGER_DIR t->path is used as is. Returns 0 if runnable. */
++	int n = snprintf(t->abs_path, sizeof(t->abs_path), "%s%s%s",
++			 trigger_dir ?: "", trigger_dir ? "/" : "", t->path);
++
++	return (n < 0 || n >= (int)sizeof(t->abs_path)) ? -1 : access(t->abs_path, R_OK | X_OK);
++}
++
++struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"};
++
++struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"};
++struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"};
+ 
+-	if (trigger_dir) {
+-		if (asprintf(&name, "%s/%s", trigger_dir, s) < 0)
+-			return NULL;
+-		s = name;
++struct event_trigger mf_trigger = {"memory_failure_event", "MEM_FAIL_TRIGGER"};
++
++struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"};
++struct event_trigger aer_ue_trigger = {"aer_event", "AER_UE_TRIGGER"};
++struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"};
++
++static struct event_trigger *event_triggers[] = {
++	&mc_ue_trigger,
++#ifdef HAVE_MCE
++	&mce_de_trigger,
++	&mce_ue_trigger,
++#endif
++#ifdef HAVE_MEMORY_FAILURE
++	&mf_trigger,
++#endif
++#ifdef HAVE_AER
++	&aer_ce_trigger,
++	&aer_ue_trigger,
++	&aer_fatal_trigger,
++#endif
++};
++
++/* Read <ENV> and <ENV>_TIMEOUT for every trigger registered for @event. */
++void setup_event_trigger(const char *event)
++{
++	int i, j;
++	struct event_trigger *trigger;
++	char *s, timeout_env[64];
++
++	trigger_dir = getenv("TRIGGER_DIR");
++
++	for (i = 0; i < ARRAY_SIZE(event_triggers); i++) {
++		trigger = event_triggers[i];
++		if (strcmp(event, trigger->event_name))
++			continue;
++
++		s = getenv(trigger->env);
++		if (!s || !strcmp(s, ""))
++			continue;
++
++		trigger->path = s;
++		if (trigger_check(trigger)) {
++			log(ALL, LOG_ERR, "Cannot access trigger `%s`: %s\n", s, strerror(errno));
++			trigger->path = NULL;	/* leave it disabled: the run helpers skip a NULL path */
++			continue;
++		}
++
++		log(ALL, LOG_NOTICE, "Setup %s trigger `%s`\n", trigger->event_name, s);
++
++		snprintf(timeout_env, sizeof(timeout_env), "%s_TIMEOUT", trigger->env);
++		trigger->timeout = 1;
++		s = getenv(timeout_env);
++		if (!s || !strcmp(s, "")) {
++			log(ALL, LOG_NOTICE,
++			    "Setup %s trigger default timeout 1s\n",
++			    trigger->event_name);
++			continue;
++		}
++
++		j = atoi(s);	/* note: non-numeric text parses as 0, i.e. "no timeout" */
++		if (j < 0)
++			log(ALL, LOG_ERR,
++			    "Invalid %s trigger timeout `%d` use default value: 1s\n",
++			    trigger->event_name, j);
++		else if (j == 0) {
++			log(ALL, LOG_NOTICE,
++			    "%s trigger no timeout\n",
++			    trigger->event_name);
++			trigger->timeout = 0;
++		} else {
++			log(ALL, LOG_NOTICE,
++			    "Setup %s trigger timeout `%d`s\n",
++			    trigger->event_name, j);
++			trigger->timeout = j;
++		}
+ 	}
++}
++
++static void __run_mce_trigger(struct mce_event *e, struct event_trigger *trigger)
++{
++	char *env[MAX_ENV];
++	int ei = 0, i;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MCGCAP=%#lx", e->mcgcap) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MCGSTATUS=%#lx", e->mcgstatus) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "STATUS=%#lx", e->status) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ADDR=%#lx", e->addr) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MISC=%#lx", e->misc) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "IP=%#lx", e->ip) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TSC=%#lx", e->tsc) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "WALLTIME=%#lx", e->walltime) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "CPU=%#x", e->cpu) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "CPUID=%#x", e->cpuid) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "APICID=%#x", e->apicid) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "SOCKETID=%#x", e->socketid) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "CS=%#x", e->cs) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "BANK=%#x", e->bank) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "CPUVENDOR=%#x", e->cpuvendor) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "SYND=%#lx", e->synd) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "IPID=%#lx", e->ipid) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TIMESTAMP=%s", e->timestamp) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "BANK_NAME=%s", e->bank_name) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ERROR_MSG=%s", e->error_msg) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MCGSTATUS_MSG=%s", e->mcgstatus_msg) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MCISTATUS_MSG=%s", e->mcistatus_msg) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MCASTATUS_MSG=%s", e->mcastatus_msg) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "USER_ACTION=%s", e->user_action) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MC_LOCATION=%s", e->mc_location) < 0)
++		goto free;
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
+ 
+-	rc = access(s, R_OK | X_OK);
++	run_trigger(trigger, NULL, env);
+ 
+-	if (!rc)
+-		return(s);
++free:
++	for (i = 0; i < ei; i++)
++		free(env[i]);
++}
++
++void run_mce_record_trigger(struct mce_event *e)
++{
++	if (e->status & MCI_STATUS_UC)
++		__run_mce_trigger(e, &mce_ue_trigger);
++	else if (e->status & MCI_STATUS_DEFERRED)
++		__run_mce_trigger(e, &mce_de_trigger);
++}
++
++static void __run_mc_trigger(struct ras_mc_event *ev, struct event_trigger *trigger)
++{
++	char *env[MAX_ENV];
++	int ei = 0, i;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "COUNT=%d", ev->error_count) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
++		goto free;
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (i = 0; i < ei; i++)
++		free(env[i]);
++}
++
++void run_mc_event_trigger(struct ras_mc_event *e)
++{
++	if (!strcmp(e->error_type, "Uncorrected"))
++		__run_mc_trigger(e, &mc_ue_trigger);
++}
++
++static void __run_mf_trigger(struct ras_mf_event *ev, struct event_trigger *trigger)
++{
++	char *env[MAX_ENV];
++	int ei = 0;
++	int i;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
+ 
+-	return NULL;
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "PFN=%s", ev->pfn) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "PAGE_TYPE=%s", ev->page_type) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ACTION_RESULT=%s", ev->action_result) < 0)
++		goto free;
++
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (i = 0; i < ei; i++)
++		free(env[i]);
++}
++
++void run_mf_event_trigger(struct ras_mf_event *e)
++{
++	__run_mf_trigger(e, &mf_trigger);
++}
++
++static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *trigger)
++{
++	char *env[MAX_ENV];
++	int ei = 0;
++	int i;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ERROR_TYPE=%s", ev->error_type) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DEV_NAME=%s", ev->dev_name) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "TLP_HEADER_VALID=%d", ev->tlp_header_valid) < 0)
++		goto free;
++	if (ev->tlp_header_valid)
++		if (asprintf(&env[ei++], "TLP_HEADER=%08x %08x %08x %08x",
++			     ev->tlp_header[0], ev->tlp_header[1],
++			     ev->tlp_header[2], ev->tlp_header[3]) < 0)
++			goto free;
++	if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0)
++		goto free;
++
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (i = 0; i < ei; i++)
++		free(env[i]);
++}
++
++void run_aer_event_trigger(struct ras_aer_event *e)
++{
++	if (!strcmp(e->error_type, "Corrected"))
++		__run_aer_trigger(e, &aer_ce_trigger);
++	else if (!strcmp(e->error_type, "Uncorrected (Non-Fatal)"))
++		__run_aer_trigger(e, &aer_ue_trigger);
++	else if (!strcmp(e->error_type, "Uncorrected (Fatal)"))
++		__run_aer_trigger(e, &aer_fatal_trigger);
+ }
+diff --git a/trigger.h b/trigger.h
+index 7d25042..31eff96 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -3,12 +3,23 @@
+ #ifndef __TRIGGER_H__
+ #define __TRIGGER_H__
+ 
++#include "ras-record.h"
++
+ struct event_trigger {
+-	const char *name;
+-	void (*setup)(void);
++	const char *event_name;
++	const char *env;
++	char *path;
++	char abs_path[256];
++	int timeout;
+ };
+ 
+-const char *trigger_check(const char *s);
+-void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter);
++int trigger_check(struct event_trigger *t);
++void run_trigger(struct event_trigger *t, char *argv[], char **envr);
++void setup_event_trigger(const char *event);
++
++void run_mc_event_trigger(struct ras_mc_event *e);
++void run_mce_record_trigger(struct mce_event *e);
++void run_mf_event_trigger(struct ras_mf_event *e);
++void run_aer_event_trigger(struct ras_aer_event *e);
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1013-rasdaemon-add-event-level-for-event-record.patch b/1013-rasdaemon-add-event-level-for-event-record.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f7a98e95140ad9d67d3b8b8ed5f3f7097e3b083a
--- /dev/null
+++ b/1013-rasdaemon-add-event-level-for-event-record.patch
@@ -0,0 +1,489 @@
+From 06f2f2a77aa546dcd5b0cb002869d08b8a016e5e Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 28 Mar 2025 13:19:47 +0800
+Subject: [PATCH] rasdaemon: add event level for event record
+
+To help users distinguish more and more events, this patch introduces
+event levels to indicate the severity of the current event to the
+system. Currently, three main levels are used: Alert, Crit, Error.
+Fatal events will be marked as "emerg" but in reality, the kernel
+will panic upon receiving a fatal event, so rasdaemon does not
+receive it.
+
+ALERT:	The uncorrected hardware error has been fixed, but caused
+	side effects.
+CRIT:	The uncorrected hardware error has been detected.
+ERROR:	The corrected hardware error has been detected.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                  |  2 +-
+ man/rasdaemon.1.in           | 15 +++++++++
+ ras-aer-handler.c            | 22 +++++++++++--
+ ras-arm-handler.c            |  2 ++
+ ras-cxl-handler.c            |  7 ++++
+ ras-devlink-handler.c        |  2 ++
+ ras-diskerror-handler.c      |  1 +
+ ras-extlog-handler.c         | 20 +++++++++++
+ ras-mc-handler.c             | 64 +++++++++++++++++++++++-------------
+ ras-mce-handler.c            |  9 +++++
+ ras-memory-failure-handler.c |  1 +
+ ras-nvgpu-nvml.c             |  4 +--
+ ras-page-isolation.c         |  5 +--
+ ras-poison-page-stat.c       |  4 +--
+ ras-signal-handler.c         |  4 +--
+ types.c                      | 18 ++++++++++
+ types.h                      | 11 +++++++
+ 17 files changed, 156 insertions(+), 35 deletions(-)
+ create mode 100644 types.c
+
+diff --git a/Makefile.am b/Makefile.am
+index 72f30b4..564a20d 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -52,7 +52,7 @@ all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTAT
+ 
+ sbin_PROGRAMS = rasdaemon
+ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
+-		    bitfield.c trigger.c
++		    bitfield.c trigger.c types.c
+ if WITH_SQLITE3
+    rasdaemon_SOURCES += ras-record.c
+ endif
+diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in
+index e884e55..2288fd9 100644
+--- a/man/rasdaemon.1.in
++++ b/man/rasdaemon.1.in
+@@ -72,6 +72,21 @@ environment variables. By default the config file is read from /etc/sysconfig/ra
+ 
+ The general format is environmentname=value.
+ 
++.SH LOG LEVEL
++
++Each log entry has a level prefix that describes the severity of the log to
++help users determine which logs are more valuable.
++Currently, three levels are used:
++.TP
++.B "ALERT"
++The uncorrected hardware error has been fixed, but caused side effects.
++.TP
++.B "CRIT"
++The uncorrected hardware error has been detected.
++.TP
++.B "ERROR"
++The corrected hardware error has been detected.
++
+ .SH SEE ALSO
+ \fBras-mc-ctl\fR(8)
+ 
+diff --git a/ras-aer-handler.c b/ras-aer-handler.c
+index 471ad9f..c67f267 100644
+--- a/ras-aer-handler.c
++++ b/ras-aer-handler.c
+@@ -123,6 +123,25 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 	uint8_t sel_data[5];
+ 	int seg, bus, dev, fn, rc;
+ #endif
++	const char *level;
++
++	if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0)
++		return -1;
++	switch (severity_val) {
++	case HW_EVENT_AER_UNCORRECTED_NON_FATAL:
++		level = loglevel_str[LOGLEVEL_CRIT];
++		break;
++	case HW_EVENT_AER_UNCORRECTED_FATAL:
++		level = loglevel_str[LOGLEVEL_EMERG];
++		break;
++	case HW_EVENT_AER_CORRECTED:
++		level = loglevel_str[LOGLEVEL_ERR];
++		break;
++	default:
++		level = loglevel_str[LOGLEVEL_DEBUG];
++		break;
++	}
++	trace_seq_printf(s, "%s ", level);
+ 
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+@@ -156,9 +175,6 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 	if (tep_get_field_val(s,  event, "status", record, &status_val, 1) < 0)
+ 		return -1;
+ 
+-	if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0)
+-		return -1;
+-
+ 	/* Fills the error buffer. If it is a correctable error then use the
+ 	 * aer_cor_errors bit field. Otherwise use aer_uncor_errors.
+ 	 */
+diff --git a/ras-arm-handler.c b/ras-arm-handler.c
+index db29327..226feb3 100644
+--- a/ras-arm-handler.c
++++ b/ras-arm-handler.c
+@@ -489,6 +489,8 @@ int ras_arm_event_handler(struct trace_seq *s,
+ 
+ 	memset(&ev, 0, sizeof(ev));
+ 
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
++
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ 	 * On previous kernels, the way to properly generate an event would
+diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
+index 6e5ddea..575fff8 100644
+--- a/ras-cxl-handler.c
++++ b/ras-cxl-handler.c
+@@ -133,6 +133,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	struct ras_cxl_poison_event ev;
+ 
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
+ 	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ 		return -1;
+@@ -345,6 +346,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
+ 	struct ras_cxl_aer_ue_event ev;
+ 
+ 	memset(&ev, 0, sizeof(ev));
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_CRIT]);
+ 	get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
+ 	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ 		return -1;
+@@ -431,6 +433,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	struct ras_cxl_aer_ce_event ev;
+ 
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
+ 	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ 		return -1;
+@@ -516,6 +519,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s,
+ 	struct ras_cxl_overflow_event ev;
+ 
+ 	memset(&ev, 0, sizeof(ev));
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
+ 	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ 		return -1;
+@@ -733,6 +737,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s,
+ 	const uint8_t *buf;
+ 
+ 	memset(&ev, 0, sizeof(ev));
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+ 		return -1;
+ 
+@@ -848,6 +853,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
+ 	struct ras_cxl_general_media_event ev;
+ 
+ 	memset(&ev, 0, sizeof(ev));
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+ 		return -1;
+ 
+@@ -1038,6 +1044,7 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
+ 	struct ras_cxl_dram_event ev;
+ 
+ 	memset(&ev, 0, sizeof(ev));
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+ 		return -1;
+ 
+diff --git a/ras-devlink-handler.c b/ras-devlink-handler.c
+index da5645d..93eba91 100644
+--- a/ras-devlink-handler.c
++++ b/ras-devlink-handler.c
+@@ -83,6 +83,8 @@ int ras_devlink_event_handler(struct trace_seq *s,
+ 	if (ras->filters[DEVLINK_EVENT] &&
+ 	    tep_filter_match(ras->filters[DEVLINK_EVENT], record) == FILTER_MATCH)
+ 		return 0;
++
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ 	 * On previous kernels, the way to properly generate an event would
+diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c
+index 43c023b..6044efa 100644
+--- a/ras-diskerror-handler.c
++++ b/ras-diskerror-handler.c
+@@ -57,6 +57,7 @@ int ras_diskerror_event_handler(struct trace_seq *s,
+ 	struct diskerror_event ev;
+ 	uint32_t dev;
+ 
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]);
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ 	 * On previous kernels, the way to properly generate an event would
+diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c
+index 46c06cf..56acf1a 100644
+--- a/ras-extlog-handler.c
++++ b/ras-extlog-handler.c
+@@ -208,6 +208,26 @@ static void report_extlog_mem_event(struct ras_events *ras,
+ 				    struct trace_seq *s,
+ 				    struct ras_extlog_event *ev)
+ {
++	const char *level;
++
++	switch (ev->severity) {
++	case 0:
++		level = loglevel_str[LOGLEVEL_CRIT];
++		break;
++	case 1:
++		level = loglevel_str[LOGLEVEL_EMERG];
++		break;
++	case 2:
++		level = loglevel_str[LOGLEVEL_ERR];
++		break;
++	case 3:
++		level = loglevel_str[LOGLEVEL_INFO];
++		break;
++	default:
++		level = loglevel_str[LOGLEVEL_DEBUG];
++		break;
++	}
++	trace_seq_printf(s, "%s ", level);
+ 	trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s",
+ 			 ev->error_seq, err_severity(ev->severity),
+ 		err_type(ev->etype), ev->address,
+diff --git a/ras-mc-handler.c b/ras-mc-handler.c
+index a729d93..e55c199 100644
+--- a/ras-mc-handler.c
++++ b/ras-mc-handler.c
+@@ -36,7 +36,7 @@ static int ras_mc_event_stat(time_t now, struct ras_mc_event *e)
+ 	}
+ 
+ 	if (per_sec_ce_count > mc_ce_stat_threshold)
+-		log(ALL, LOG_ERR, "    mc_event_stat: memory corrected error report %lld/sec\n", per_sec_ce_count);
++		log(ALL, LOG_ERR, "    mc_event_stat: %s memory corrected error report %lld/sec\n", loglevel_str[LOGLEVEL_ALERT], per_sec_ce_count);
+ 
+ 	return 0;
+ }
+@@ -52,6 +52,46 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 	struct tm *tm;
+ 	struct ras_mc_event ev;
+ 	int parsed_fields = 0;
++	const char *level;
++
++	if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0)
++		goto parse_error;
++	parsed_fields++;
++
++	switch (val) {
++	case HW_EVENT_ERR_CORRECTED:
++		ev.error_type = "Corrected";
++		break;
++	case HW_EVENT_ERR_UNCORRECTED:
++		ev.error_type = "Uncorrected";
++		break;
++	case HW_EVENT_ERR_DEFERRED:
++		ev.error_type = "Deferred";
++		break;
++	case HW_EVENT_ERR_FATAL:
++		ev.error_type = "Fatal";
++		break;
++	case HW_EVENT_ERR_INFO:
++	default:
++		ev.error_type = "Info";
++	}
++
++	switch (val) {
++	case HW_EVENT_ERR_UNCORRECTED:
++	case HW_EVENT_ERR_DEFERRED:
++		level = loglevel_str[LOGLEVEL_CRIT];
++		break;
++	case HW_EVENT_ERR_FATAL:
++		level = loglevel_str[LOGLEVEL_EMERG];
++		break;
++	case HW_EVENT_ERR_CORRECTED:
++		level = loglevel_str[LOGLEVEL_ERR];
++		break;
++	default:
++		level = loglevel_str[LOGLEVEL_DEBUG];
++		break;
++	}
++	trace_seq_printf(s, "%s ", level);
+ 
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+@@ -80,28 +120,6 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 	ev.error_count = val;
+ 	trace_seq_printf(s, "%d ", ev.error_count);
+ 
+-	if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0)
+-		goto parse_error;
+-	parsed_fields++;
+-
+-	switch (val) {
+-	case HW_EVENT_ERR_CORRECTED:
+-		ev.error_type = "Corrected";
+-		break;
+-	case HW_EVENT_ERR_UNCORRECTED:
+-		ev.error_type = "Uncorrected";
+-		break;
+-	case HW_EVENT_ERR_DEFERRED:
+-		ev.error_type = "Deferred";
+-		break;
+-	case HW_EVENT_ERR_FATAL:
+-		ev.error_type = "Fatal";
+-		break;
+-	case HW_EVENT_ERR_INFO:
+-	default:
+-		ev.error_type = "Info";
+-	}
+-
+ 	trace_seq_puts(s, ev.error_type);
+ 	if (ev.error_count > 1)
+ 		trace_seq_puts(s, " errors:");
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index 92c5339..c272bb0 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -290,7 +290,16 @@ void report_mce_event(struct ras_events *ras, struct tep_record *record,
+ 	time_t now;
+ 	struct tm *tm;
+ 	struct mce_priv *mce = ras->mce_priv;
++	const char *level;
+ 
++	if (e->status & MCI_STATUS_UC)
++		level = loglevel_str[LOGLEVEL_CRIT];
++	else if (e->status & MCI_STATUS_DEFERRED)
++		level = loglevel_str[LOGLEVEL_CRIT];
++	else
++		level = loglevel_str[LOGLEVEL_ERR];
++
++	trace_seq_printf(s, "%s ", level);
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ 	 * On previous kernels, the way to properly generate an event would
+diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
+index 0f4e937..43e7c5d 100644
+--- a/ras-memory-failure-handler.c
++++ b/ras-memory-failure-handler.c
+@@ -119,6 +119,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
+ 	struct tm *tm;
+ 	struct ras_mf_event ev;
+ 
++	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]);
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ 	 * On previous kernels, the way to properly generate an event would
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c
+index aabe8f9..2758d14 100644
+--- a/ras-nvgpu-nvml.c
++++ b/ras-nvgpu-nvml.c
+@@ -58,12 +58,12 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 	if (data->eventType == nvmlEventTypeXidCriticalError) {
+ 		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
+ 			"<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME);
+-		trace_seq_printf(&s, "%s ", timestamp);
++		trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp);
+ 		trace_seq_printf(&s, "xid: %lld ", data->eventData);
+ 	} else {
+ 		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
+ 			"<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME);
+-		trace_seq_printf(&s, "%s ", timestamp);
++		trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp);
+ 		trace_seq_printf(&s, "event_type: %s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType);
+ 		trace_seq_printf(&s, "data: %lld ", data->eventData);
+ 	}
+diff --git a/ras-page-isolation.c b/ras-page-isolation.c
+index 246cd12..237495c 100644
+--- a/ras-page-isolation.c
++++ b/ras-page-isolation.c
+@@ -17,6 +17,7 @@
+ #include "ras-page-isolation.h"
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
++#include "types.h"
+ 
+ #define PARSED_ENV_LEN 50
+ #define ROW_ID_MAX_LEN 200
+@@ -349,8 +350,8 @@ static void page_offline(struct page_record *pr)
+ 
+ 	pr->offlined = ret < 0 ? PAGE_OFFLINE_FAILED : PAGE_OFFLINE;
+ 
+-	log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n",
+-	    addr, page_state[pr->offlined]);
++	log(TERM, LOG_INFO, "%s Result of offlining page at %#llx: %s\n",
++	    loglevel_str[LOGLEVEL_ALERT], addr, page_state[pr->offlined]);
+ 
+ #ifdef HAVE_POISON_PAGE_STAT
+ 	ras_poison_page_stat();
+diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c
+index 2ce1d2a..c8d8859 100644
+--- a/ras-poison-page-stat.c
++++ b/ras-poison-page-stat.c
+@@ -34,8 +34,8 @@ int ras_poison_page_stat(void)
+ 	fclose(fp);
+ 
+ 	if (corrupted_kb > poison_stat_threshold)
+-		log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n",
+-		    corrupted_kb, poison_stat_threshold);
++		log(ALL, LOG_WARNING, "%s Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n",
++			loglevel_str[LOGLEVEL_ALERT], corrupted_kb, poison_stat_threshold);
+ 
+ 	return 0;
+ }
+diff --git a/ras-signal-handler.c b/ras-signal-handler.c
+index fb0bfd3..c497bf0 100644
+--- a/ras-signal-handler.c
++++ b/ras-signal-handler.c
+@@ -61,8 +61,8 @@ static char *signal_res[] = {
+ static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev)
+ {
+ 	trace_seq_printf(s,
+-			 "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s",
+-			 ev->timestamp, strsignal(ev->sig), ev->error_no,
++			 "%s %s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s",
++			 loglevel_str[LOGLEVEL_ALERT], ev->timestamp, strsignal(ev->sig), ev->error_no,
+ 			 (ev->code < 0 || ev->code > BUS_MCEERR_AO) ? "Unknown" : errcode_str[ev->code],
+ 			 ev->comm, ev->pid,
+ 			 ev->group,
+diff --git a/types.c b/types.c
+new file mode 100644
+index 0000000..d4270ac
+--- /dev/null
++++ b/types.c
+@@ -0,0 +1,18 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++ * Copyright (C) 2025 Alibaba Inc
++ */
++
++#include "types.h"
++
++const char *loglevel_str[] = {
++	[LOGLEVEL_EMERG]	= "[EMERG]",
++	[LOGLEVEL_ALERT]	= "[ALERT]",
++	[LOGLEVEL_CRIT]		= "[CRIT]",
++	[LOGLEVEL_ERR]		= "[ERROR]",
++	[LOGLEVEL_WARNING]	= "[WARNING]",
++	[LOGLEVEL_NOTICE]	= "[NOTICE]",
++	[LOGLEVEL_INFO]		= "[INFO]",
++	[LOGLEVEL_DEBUG]	= "[DEBUG]",
++};
+\ No newline at end of file
+diff --git a/types.h b/types.h
+index 58cac1f..8563919 100644
+--- a/types.h
++++ b/types.h
+@@ -189,4 +189,15 @@ static inline size_t strscat(char *dst, const char *src, size_t dsize)
+ 		      "pointer type mismatch in container_of()");	\
+ 	((type *)(__mptr - offsetof(type, member))); })
+ 
++#define LOGLEVEL_DEFAULT	-1	/* default (or last) loglevel */
++#define LOGLEVEL_EMERG		0	/* system is unusable */
++#define LOGLEVEL_ALERT		1	/* action must be taken immediately */
++#define LOGLEVEL_CRIT		2	/* critical conditions */
++#define LOGLEVEL_ERR		3	/* error conditions */
++#define LOGLEVEL_WARNING	4	/* warning conditions */
++#define LOGLEVEL_NOTICE		5	/* normal but significant condition */
++#define LOGLEVEL_INFO		6	/* informational */
++#define LOGLEVEL_DEBUG		7	/* debug-level messages */
++
++extern const char *loglevel_str[];
+ #endif
+-- 
+2.43.5
+
diff --git a/1014-anolis-syslog-add-rasdaemon.ext.patch b/1014-anolis-syslog-add-rasdaemon.ext.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9bbc6f3727b9361d9131ae4481c74ea01799bfb3
--- /dev/null
+++ b/1014-anolis-syslog-add-rasdaemon.ext.patch
@@ -0,0 +1,250 @@
+From b4e1a8c87a7c079c35db5190067808df4ae471a6 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 3 Apr 2025 15:16:09 +0800
+Subject: [PATCH 14/30] anolis: syslog: add rasdaemon.ext
+
+Filter aer/pcihp/cmcistorm event through syslog-ng/rsyslog
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                     | 24 ++++++++++-
+ misc/rasdaemon.rsyslog-ext.in   | 26 ++++++++++++
+ misc/rasdaemon.spec.in          | 10 +++++
+ misc/rasdaemon.syslog-ng-ext.in | 71 +++++++++++++++++++++++++++++++++
+ 5 files changed, 131 insertions(+), 2 deletions(-)
+ create mode 100644 misc/rasdaemon.rsyslog-ext.in
+ create mode 100644 misc/rasdaemon.syslog-ng-ext.in
+
+diff --git a/Makefile.am b/Makefile.am
+index 564a20d..ab26412 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -11,17 +11,25 @@ LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in
+ LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate)
+ RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in
+ RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog)
++SYSLOG_EXT_SERVICES_IN = misc/rasdaemon.syslog-ng-ext.in
++SYSLOG_EXT_SERVICES = $(SYSLOG_EXT_SERVICES_IN:.syslog-ng-ext.in=.syslog-ng-ext)
++RSYSLOG_EXT_SERVICES_IN = misc/rasdaemon.rsyslog-ext.in
++RSYSLOG_EXT_SERVICES = $(RSYSLOG_EXT_SERVICES_IN:.rsyslog-ext.in=.rsyslog-ext)
+ EXTRA_DIST = \
+ 	$(SYSTEMD_SERVICES_IN) \
+ 	$(SYSLOG_SERVICES_IN) \
+ 	$(RSYSLOG_SERVICES_IN) \
+ 	$(LOGROTATE_SERVICES_IN) \
++	$(SYSLOG_EXT_SERVICES_IN) \
++	$(RSYSLOG_EXT_SERVICES_IN) \
+ 	misc/rasdaemon.env \
+ 	contrib/nvml.py \
+ 	contrib/*_trigger
+ 
+ CLEANFILES= \
+ 	ras-nvgpu-nvml.h \
++	misc/rasdaemon.syslog-ng-ext	\
++	misc/rasdaemon.rsyslog-ext	\
+ 	misc/ras-mc-ctl.service	\
+ 	misc/rasdaemon.service \
+ 	misc/rasdaemon.syslog-ng \
+@@ -33,7 +41,7 @@ DISTCLEANFILES = misc/rasdaemon.spec
+ # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin
+ # during ./configure phase, therefore it is not possible to add .service.in
+ # files to AC_CONFIG_FILES in configure.ac
+-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog
++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog .rsyslog-ext.in .rsyslog-ext .syslog-ng-ext.in .syslog-ng-ext
+ .service.in.service:
+ 	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@
+ 
+@@ -46,9 +54,15 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n
+ .rsyslog.in.rsyslog:
+ 	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
+ 
++.syslog-ng-ext.in.syslog-ng-ext:
++	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
++
++.rsyslog-ext.in.rsyslog-ext:
++	sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@
++
+ # This rule is needed because the service files must be generated on target
+ # system after ./configure phase
+-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES)
++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) $(SYSLOG_EXT_SERVICES) $(RSYSLOG_EXT_SERVICES)
+ 
+ sbin_PROGRAMS = rasdaemon
+ rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
+@@ -179,4 +193,10 @@ install-data-local:
+ 	if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \
+ 	fi
++	if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/" ]; then \
++		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng-ext "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon-ext.conf"; \
++	fi
++	if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \
++		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon-ext.conf"; \
++	fi
+ 	$(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/"
+diff --git a/misc/rasdaemon.rsyslog-ext.in b/misc/rasdaemon.rsyslog-ext.in
+new file mode 100644
+index 0000000..63cffc2
+--- /dev/null
++++ b/misc/rasdaemon.rsyslog-ext.in
+@@ -0,0 +1,26 @@
++# SPDX-License-Identifier: GPL-2.0
++
++template(name="rasdaemon_temp" type="string" string="%timegenerated% %hostname% rasdaemon: %$!event%: %$!level% %msg%\n")
++
++if ($syslogfacility-text == "kern" and $msg contains "CMCI storm") then {
++        set $!event = "cmci_storm";
++
++        if $msg contains "detected" then set $!level = "[ALERT]";
++        if $msg contains "subsided" then set $!level = "[ERROR]";
++        action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp")
++}
++
++if ($syslogfacility-text == "kern" and $msg contains "AER: device recovery") then {
++        set $!event = "aer_recovery";
++
++        if $msg contains "failed" then set $!level = "[EMERG]";
++        if $msg contains "successful" then set $!level = "[ALERT]";
++        action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp")
++}
++
++if ($syslogfacility-text == "kern" and $msg contains "pciehp: Slot") then {
++        set $!event = "pciehp";
++        if $msg contains "Link Down" then set $!level = "[ALERT]";
++        if $msg contains "Card not present" then set $!level = "[ALERT]";
++        action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp")
++}
+\ No newline at end of file
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index a30045c..521f148 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -57,6 +57,8 @@ install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{na
+ install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng
+ install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate
+ install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog
++install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext
++install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext
+ rm INSTALL %{buildroot}/usr/include/*.h
+ 
+ %files
+@@ -71,18 +73,24 @@ rm INSTALL %{buildroot}/usr/include/*.h
+ %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng
+ %config(noreplace) /usr/share/%{name}/%{name}.logrotate
+ %config(noreplace) /usr/share/%{name}/%{name}.rsyslog
++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext
++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext
+ 
+ %post
+ if systemctl is-active --quiet syslog-ng.service; then
+     echo "Syslog service is enabled and running, create config file and restart it";
+     rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+     ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
++    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
++    ln -s /usr/share/%{name}/%{name}.syslog-ng-ext %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
+     systemctl restart syslog-ng.service;
+ fi
+ if systemctl is-active --quiet rsyslog.service; then
+     echo "Rsyslog service is enabled and running, create config file and restart it";
+     rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf;
+     ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf;
++    rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
++    ln -s /usr/share/%{name}/%{name}.rsyslog-ext %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
+     systemctl restart rsyslog.service;
+ fi
+ if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+@@ -103,11 +111,13 @@ systemctl disable %{name}.service
+ if systemctl is-active --quiet syslog-ng.service; then
+     echo "Syslog-ng service is enabled and running, delete config file and restart it";
+     rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
++    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
+     systemctl restart syslog-ng.service;
+ fi
+ if systemctl is-active --quiet rsyslog.service; then
+     echo "Rsyslog service is enabled and running, delete config file and restart it";
+     rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf;
++    rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
+     systemctl restart rsyslog.service;
+ fi
+ if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+diff --git a/misc/rasdaemon.syslog-ng-ext.in b/misc/rasdaemon.syslog-ng-ext.in
+new file mode 100644
+index 0000000..ad001d2
+--- /dev/null
++++ b/misc/rasdaemon.syslog-ng-ext.in
+@@ -0,0 +1,71 @@
++# SPDX-License-Identifier: GPL-2.0
++
++destination d_ras  {
++        file("/var/log/rasdaemon"
++                template("${DATE} ${HOST} rasdaemon: ${RASDAEMON_EVENT}: ${RASDAEMON_LEVEL} ${MESSAGE}\n")
++        persist-name(ras-ext));
++};
++
++filter f_aer {
++        facility(kern) and
++        match("AER: device recovery" value("MESSAGE"));
++};
++
++rewrite r_aer {
++        set("aer_recovery", value("RASDAEMON_EVENT"));
++        set("[EMERG]", value("RASDAEMON_LEVEL")
++                condition(match("failed" value("MESSAGE")))
++        );
++        set("[ALERT]", value("RASDAEMON_LEVEL")
++                condition(match("successful" value("MESSAGE")))
++        );
++};
++
++filter f_cmcistorm {
++        facility(kern) and
++        match("CMCI storm" value("MESSAGE"));
++};
++
++rewrite r_cmcistorm {
++        set("cmci_storm", value("RASDAEMON_EVENT"));
++        set("[ALERT]", value("RASDAEMON_LEVEL")
++                condition(match("detected" value("MESSAGE")))
++        );
++        set("[ERROR]", value("RASDAEMON_LEVEL")
++                condition(match("subsided" value("MESSAGE")))
++        );
++};
++
++filter f_pciehp {
++        facility(kern) and
++        match("pciehp: Slot" value("MESSAGE"));
++};
++
++rewrite r_pciehp {
++        set("pciehp", value("RASDAEMON_EVENT"));
++        set("[ALERT]", value("RASDAEMON_LEVEL")
++                condition(match("Link Down" value("MESSAGE")))
++        );
++        set("[ALERT]", value("RASDAEMON_LEVEL")
++                condition(match("Card not present" value("MESSAGE")))
++        );
++};
++
++log {
++        source(s_sys);
++        junction {
++                channel {
++                        filter(f_cmcistorm);
++                        rewrite(r_cmcistorm);
++                };
++                channel {
++                        filter(f_pciehp);
++                        rewrite(r_pciehp);
++                };
++                channel {
++                        filter(f_aer);
++                        rewrite(r_aer);
++                };
++        };
++        destination(d_ras);
++};
+\ No newline at end of file
+-- 
+2.43.5
+
diff --git a/1015-rasdaemon-add-page-offline-trigger.patch b/1015-rasdaemon-add-page-offline-trigger.patch
new file mode 100644
index 0000000000000000000000000000000000000000..20480a043a15f5ababeca2ba1492d6c2d5fc88f1
--- /dev/null
+++ b/1015-rasdaemon-add-page-offline-trigger.patch
@@ -0,0 +1,238 @@
+From e9995846c39321300a9c89936086222fab3cbb1c Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 13 Dec 2024 14:38:02 +0800
+Subject: [PATCH 15/30] rasdaemon: add page offline trigger
+
+Add pre and post triggers that rasdaemon runs before and after a page offline.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ contrib/page_offline_post_trigger | 25 ++++++++++++++++++
+ contrib/page_offline_pre_trigger  | 25 ++++++++++++++++++
+ misc/rasdaemon.env                |  5 ++++
+ ras-page-isolation.c              |  4 +++
+ trigger.c                         | 43 +++++++++++++++++++++++++++++++
+ trigger.h                         |  6 +++++
+ 6 files changed, 108 insertions(+)
+ create mode 100755 contrib/page_offline_post_trigger
+ create mode 100755 contrib/page_offline_pre_trigger
+
+diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger
+new file mode 100755
+index 0000000..4d3329c
+--- /dev/null
++++ b/contrib/page_offline_post_trigger
+@@ -0,0 +1,25 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when a
++#  memory_failure_event is occurred, environment variables include all
++#  information reported by tracepoint.
++
++# environment:
++# TIMESTAMP             Timestamp when error occurred
++# ADDR                  Address
++# OTYPE                 POST | PRE
++#
++
++[ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local
++
++if [ -d page_offline_post_trigger.extern ]
++then
++    ls page_offline_post_trigger.extern |
++    while read item
++    do
++        [ -x ./page_offline_post_trigger.extern/$item ] && . ./page_offline_post_trigger.extern/$item $1
++    done
++fi
++
++
++exit 0
+diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger
+new file mode 100755
+index 0000000..e464382
+--- /dev/null
++++ b/contrib/page_offline_pre_trigger
+@@ -0,0 +1,25 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when a
++#  memory_failure_event is occurred, environment variables include all
++#  information reported by tracepoint.
++
++# environment:
++# TIMESTAMP             Timestamp when error occurred
++# ADDR                  Address
++# OTYPE                 POST | PRE
++#
++
++[ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local
++
++if [ -d page_offline_pre_trigger.extern ]
++then
++    ls page_offline_pre_trigger.extern |
++    while read item
++    do
++        [ -x ./page_offline_pre_trigger.extern/$item ] && . ./page_offline_pre_trigger.extern/$item $1
++    done
++fi
++
++
++exit 0
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 1f5da55..f3f17c2 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -108,6 +108,11 @@ AER_CE_TRIGGER_TIMEOUT=0
+ AER_UE_TRIGGER_TIMEOUT=0
+ AER_FATAL_TRIGGER_TIMEOUT=0
+ 
++PRE_PAGE_OFFLINE_TRIGGER=
++POST_PAGE_OFFLINE_TRIGGER=
++PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
++POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
++
+ # CE Statistic Threshold
+ #
+ # Specify the threshold of CE per second.
+diff --git a/ras-page-isolation.c b/ras-page-isolation.c
+index 237495c..569293f 100644
+--- a/ras-page-isolation.c
++++ b/ras-page-isolation.c
+@@ -18,6 +18,7 @@
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
+ #include "types.h"
++#include "trigger.h"
+ 
+ #define PARSED_ENV_LEN 50
+ #define ROW_ID_MAX_LEN 200
+@@ -296,6 +297,7 @@ void ras_page_account_init(void)
+ {
+ 	page_offline_init();
+ 	page_isolation_init();
++	setup_event_trigger("page_offline");
+ }
+ 
+ static int do_page_offline(unsigned long long addr, enum otype type)
+@@ -303,6 +305,7 @@ static int do_page_offline(unsigned long long addr, enum otype type)
+ 	int fd, rc;
+ 	char buf[20];
+ 
++	run_page_offline_trigger(addr, type, PRE);
+ 	fd = open(kernel_offline[type], O_WRONLY);
+ 	if (fd == -1) {
+ 		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__,
+@@ -318,6 +321,7 @@ static int do_page_offline(unsigned long long addr, enum otype type)
+ 		    buf, kernel_offline[type], errno);
+ 
+ 	close(fd);
++	run_page_offline_trigger(addr, type, POST);
+ 	return rc;
+ }
+ 
+diff --git a/trigger.c b/trigger.c
+index a13fffd..7387113 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -11,6 +11,7 @@
+ #include "types.h"
+ #include "trigger.h"
+ 
++#include "ras-events.h"
+ #include "ras-mce-handler.h"
+ 
+ #define MAX_ENV 30
+@@ -95,6 +96,9 @@ struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"};
+ struct event_trigger aer_ue_trigger = {"aer_event", "AER_UE_TRIGGER"};
+ struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"};
+ 
++struct event_trigger pre_page_offline_trigger = {"page_offline", "PRE_PAGE_OFFLINE_TRIGGER"};
++struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFFLINE_TRIGGER"};
++
+ static struct event_trigger *event_triggers[] = {
+ 	&mc_ue_trigger,
+ #ifdef HAVE_MCE
+@@ -109,6 +113,10 @@ static struct event_trigger *event_triggers[] = {
+ 	&aer_ue_trigger,
+ 	&aer_fatal_trigger,
+ #endif
++#ifdef HAVE_MEMORY_CE_PFA
++	&pre_page_offline_trigger,
++	&post_page_offline_trigger,
++#endif
+ };
+ 
+ void setup_event_trigger(const char *event)
+@@ -358,6 +366,32 @@ static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *tr
+ 			goto free;
+ 	if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0)
+ 		goto free;
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (i = 0; i < ei; i++)
++		free(env[i]);
++}
++
++static void __run_page_offline_trigger(unsigned long long addr, int otype,
++				       struct event_trigger *trigger)
++{
++	char *env[MAX_ENV];
++	int ei = 0;
++	int i;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "ADDR=%#llx", addr) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "OTYPE=%d", otype) < 0)
++		goto free;
+ 
+ 	env[ei] = NULL;
+ 	assert(ei < MAX_ENV);
+@@ -378,3 +412,12 @@ void run_aer_event_trigger(struct ras_aer_event *e)
+ 	else if (!strcmp(e->error_type, "Uncorrected (Fatal)"))
+ 		__run_aer_trigger(e, &aer_fatal_trigger);
+ }
++
++void run_page_offline_trigger(unsigned long long addr, int otype, int type)
++{
++	if (type == POST)
++		__run_page_offline_trigger(addr, otype, &post_page_offline_trigger);
++	else
++		__run_page_offline_trigger(addr, otype, &pre_page_offline_trigger);
++}
++
+diff --git a/trigger.h b/trigger.h
+index 31eff96..74df3d3 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -5,6 +5,11 @@
+ 
+ #include "ras-record.h"
+ 
++enum page_offline_trigger_type {
++	PRE,
++	POST,
++};
++
+ struct event_trigger {
+ 	const char *event_name;
+ 	const char *env;
+@@ -21,5 +26,6 @@ void run_mc_event_trigger(struct ras_mc_event *e);
+ void run_mce_record_trigger(struct mce_event *e);
+ void run_mf_event_trigger(struct ras_mf_event *e);
+ void run_aer_event_trigger(struct ras_aer_event *e);
++void run_page_offline_trigger(unsigned long long addr, int otype, int type);
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1016-anolis-compta-rasdaemon-notices.patch b/1016-anolis-compta-rasdaemon-notices.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e13915c789c78aa257f0159edd013fc0a0ad9070
--- /dev/null
+++ b/1016-anolis-compta-rasdaemon-notices.patch
@@ -0,0 +1,129 @@
+From c1182ad260e0161817d0a4bbea31bcfe5fe7dbd3 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 13 Dec 2024 14:38:02 +0800
+Subject: [PATCH 16/30] anolis: compta rasdaemon notices
+
+add page-ce-offline pre/post notice scripts, called from the page offline triggers.
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                              |  1 +
+ contrib/page_offline_post_trigger        |  2 ++
+ contrib/page_offline_pre_trigger         |  2 ++
+ misc/notices/page-ce-offline-post-notice | 16 ++++++++++++++++
+ misc/notices/page-ce-offline-pre-notice  | 18 ++++++++++++++++++
+ misc/rasdaemon.spec.in                   |  3 +++
+ 6 files changed, 42 insertions(+)
+ create mode 100644 misc/notices/page-ce-offline-post-notice
+ create mode 100644 misc/notices/page-ce-offline-pre-notice
+
+diff --git a/Makefile.am b/Makefile.am
+index ab26412..61f9a84 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -23,6 +23,7 @@ EXTRA_DIST = \
+ 	$(SYSLOG_EXT_SERVICES_IN) \
+ 	$(RSYSLOG_EXT_SERVICES_IN) \
+ 	misc/rasdaemon.env \
++	misc/notices \
+ 	contrib/nvml.py \
+ 	contrib/*_trigger
+ 
+diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger
+index 4d3329c..ad7d44c 100755
+--- a/contrib/page_offline_post_trigger
++++ b/contrib/page_offline_post_trigger
+@@ -12,6 +12,8 @@
+ 
+ [ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local
+ 
++[ -x /etc/rasdaemon_notices/page-ce-offline-post-notice ] && /etc/rasdaemon_notices/page-ce-offline-post-notice $(printf "%lu" "$ADDR") # child process: the notice's cd must not leak into this script
++
+ if [ -d page_offline_post_trigger.extern ]
+ then
+     ls page_offline_post_trigger.extern |
+diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger
+index e464382..6d8d3f2 100755
+--- a/contrib/page_offline_pre_trigger
++++ b/contrib/page_offline_pre_trigger
+@@ -12,6 +12,8 @@
+ 
+ [ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local
+ 
++[ -x /etc/rasdaemon_notices/page-ce-offline-pre-notice ] && /etc/rasdaemon_notices/page-ce-offline-pre-notice $(printf "%lu" "$ADDR") # child process: the notice's cd/exit must not leak into this script
++
+ if [ -d page_offline_pre_trigger.extern ]
+ then
+     ls page_offline_pre_trigger.extern |
+diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice
+new file mode 100644
+index 0000000..01966af
+--- /dev/null
++++ b/misc/notices/page-ce-offline-post-notice
+@@ -0,0 +1,16 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon after a page goes offline.
++
++cd /etc/rasdaemon_notices/
++
++[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1
++
++if [ -d page-ce-offline-post-notice.extern ]
++then
++    ls page-ce-offline-post-notice.extern |
++    while read item
++    do
++        [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1
++    done
++fi
+\ No newline at end of file
+diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice
+new file mode 100644
+index 0000000..187556c
+--- /dev/null
++++ b/misc/notices/page-ce-offline-pre-notice
+@@ -0,0 +1,18 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon before a page goes offline.
++
++cd /etc/rasdaemon_notices/
++
++[ -x ./page-ce-offline-pre-notice.local ] && . ./page-ce-offline-pre-notice.local $1
++
++if [ -d page-ce-offline-pre-notice.extern ]
++then
++    ls page-ce-offline-pre-notice.extern |
++    while read item
++    do
++        [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1
++    done
++fi
++
++exit 0
+\ No newline at end of file
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index 521f148..23be188 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -59,6 +59,8 @@ install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{na
+ install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog
+ install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext
+ install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext
++install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/
++install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/
+ rm INSTALL %{buildroot}/usr/include/*.h
+ 
+ %files
+@@ -75,6 +77,7 @@ rm INSTALL %{buildroot}/usr/include/*.h
+ %config(noreplace) /usr/share/%{name}/%{name}.rsyslog
+ %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext
+ %config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext
++%{_sysconfdir}/rasdaemon_notices/*
+ 
+ %post
+ if systemctl is-active --quiet syslog-ng.service; then
+-- 
+2.43.5
+
diff --git a/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch
new file mode 100644
index 0000000000000000000000000000000000000000..637a05b8de3e0623722d23c6d5129e05b3e06d05
--- /dev/null
+++ b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch
@@ -0,0 +1,631 @@
+From 637a69ee5de5376eb185ea390cd07d8b9e5d4747 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Mon, 9 Dec 2024 16:28:54 +0800
+Subject: [PATCH 17/30] anolis: rasdaemon: add rasdaemon json exporter
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am                  |   3 +
+ configure.ac                 |  16 +++
+ misc/rasdaemon.env           |   2 +
+ ras-aer-handler.c            |   9 +-
+ ras-arm-handler.c            |   6 +-
+ ras-mc-handler.c             |  11 +-
+ ras-mce-handler.c            |   7 +-
+ ras-mce-handler.h            |   1 +
+ ras-memory-failure-handler.c |   6 +-
+ ras-record.h                 |   9 ++
+ ras-report-json.c            | 238 +++++++++++++++++++++++++++++++++++
+ ras-report.h                 |  14 +++
+ ras-signal-handler.c         |   2 +-
+ rasdaemon.c                  |   8 ++
+ 14 files changed, 326 insertions(+), 6 deletions(-)
+ create mode 100644 ras-report-json.c
+
+diff --git a/Makefile.am b/Makefile.am
+index 61f9a84..1f21137 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -104,6 +104,9 @@ endif
+ if WITH_ABRT_REPORT
+    rasdaemon_SOURCES += ras-report.c
+ endif
++if WITH_JSON_REPORT
++   rasdaemon_SOURCES += ras-report-json.c
++endif
+ if WITH_HISI_NS_DECODE
+    rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
+ endif
+diff --git a/configure.ac b/configure.ac
+index 43d845d..c5164ec 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -170,6 +170,21 @@ AS_IF([test "x$enable_abrt_report" = "xyes" || test "x$enable_all" = "xyes"], [
+ AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes || test x$enable_all = xyes])
+ AM_COND_IF([WITH_ABRT_REPORT], [USE_ABRT_REPORT="yes"], [USE_ABRT_REPORT="no"])
+ 
++AC_ARG_ENABLE([json_report],
++    AS_HELP_STRING([--enable-json-report], [enable JSON event report (currently experimental)]))
++# The JSON exporter uses libpci to resolve PCI device names for AER events.
++AS_IF([test "x$enable_json_report" = "xyes" || test "x$enable_all" = "xyes"], [
++  AC_CHECK_LIB(pci, pci_lookup_name,[echo "found pci"] , AC_MSG_ERROR([*** Unable to find pci library]), )
++  PCI_LIBS="-lpci"
++  AC_DEFINE(HAVE_JSON_REPORT,1,"have libpci")
++  AC_SUBST([WITH_JSON_REPORT])
++])
++
++AM_CONDITIONAL([WITH_JSON_REPORT], [test x$enable_json_report = xyes || test x$enable_all = xyes])
++AM_COND_IF([WITH_JSON_REPORT], [USE_JSON_REPORT="yes"], [USE_JSON_REPORT="no"])
++
++AC_SUBST([PCI_LIBS])
++
+ AC_ARG_ENABLE([hisi_ns_decode],
+     AS_HELP_STRING([--enable-hisi-ns-decode], [enable HISI_NS_DECODE events (currently experimental)]))
+ 
+@@ -337,4 +352,5 @@ compile time options summary
+     Signal              : $USE_SIGNAL
+     ERST                : $USE_ERST
+     NVGPU RAS errors    : $USE_NVGPU
++    Json exporter       : $USE_JSON_REPORT
+ EOF
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index f3f17c2..085d839 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -73,6 +73,8 @@ CPU_ISOLATION_CYCLE="24h"
+ # Prevent excessive isolation from causing an avalanche effect
+ CPU_ISOLATION_LIMIT="10"
+ 
++DISABLE="json_report"
++
+ # Event Trigger
+ 
+ # Event trigger will be executed when the specified event occurs.
+diff --git a/ras-aer-handler.c b/ras-aer-handler.c
+index c67f267..023dd4d 100644
+--- a/ras-aer-handler.c
++++ b/ras-aer-handler.c
+@@ -115,7 +115,7 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_aer_event ev;
++	struct ras_aer_event ev = { 0 };
+ 	char buf[BUF_LEN] = { 0 };
+ 	uint16_t vendor_id = 0, device_id = 0;
+ #ifdef HAVE_AMP_NS_DECODE
+@@ -207,24 +207,28 @@ int ras_aer_event_handler(struct trace_seq *s,
+ #ifdef HAVE_AMP_NS_DECODE
+ 		sel_data[0] = 0xca;
+ #endif
++		ev.severity = GHES_SEV_RECOVERABLE;
+ 		break;
+ 	case HW_EVENT_AER_UNCORRECTED_FATAL:
+ 		ev.error_type = "Uncorrected (Fatal)";
+ #ifdef HAVE_AMP_NS_DECODE
+ 		sel_data[0] = 0xca;
+ #endif
++		ev.severity = GHES_SEV_PANIC;
+ 		break;
+ 	case HW_EVENT_AER_CORRECTED:
+ 		ev.error_type = "Corrected";
+ #ifdef HAVE_AMP_NS_DECODE
+ 		sel_data[0] = 0xbf;
+ #endif
++		ev.severity = GHES_SEV_CORRECTED;
+ 		break;
+ 	default:
+ 		ev.error_type = "Unknown severity";
+ #ifdef HAVE_AMP_NS_DECODE
+ 		sel_data[0] = 0xbf;
+ #endif
++		ev.severity = GHES_SEV_NO;
+ 	}
+ 	trace_seq_puts(s, ev.error_type);
+ 
+@@ -271,6 +275,9 @@ int ras_aer_event_handler(struct trace_seq *s,
+ 			return -1;
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++	report_aer_event_json(s, &ev);
++#endif
+ 	run_aer_event_trigger(&ev);
+ 
+ 	return 0;
+diff --git a/ras-arm-handler.c b/ras-arm-handler.c
+index 226feb3..431dd9b 100644
+--- a/ras-arm-handler.c
++++ b/ras-arm-handler.c
+@@ -484,7 +484,7 @@ int ras_arm_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_arm_event ev;
++	struct ras_arm_event ev = { 0 };
+ 	int len = 0;
+ 
+ 	memset(&ev, 0, sizeof(ev));
+@@ -606,5 +606,9 @@ int ras_arm_event_handler(struct trace_seq *s,
+ 	ras_report_arm_event(ras, &ev);
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++	report_arm_event_json(s, &ev);
++#endif
++
+ 	return 0;
+ }
+diff --git a/ras-mc-handler.c b/ras-mc-handler.c
+index e55c199..2ffaf2e 100644
+--- a/ras-mc-handler.c
++++ b/ras-mc-handler.c
+@@ -17,6 +17,7 @@
+ #include "ras-mc-handler.h"
+ #include "ras-page-isolation.h"
+ #include "ras-report.h"
++#include "ras-events.h"
+ #include "trigger.h"
+ #include "types.h"
+ 
+@@ -50,7 +51,7 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_mc_event ev;
++	struct ras_mc_event ev = { 0 };
+ 	int parsed_fields = 0;
+ 	const char *level;
+ 
+@@ -61,19 +62,23 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 	switch (val) {
+ 	case HW_EVENT_ERR_CORRECTED:
+ 		ev.error_type = "Corrected";
++		ev.severity = GHES_SEV_CORRECTED;
+ 		break;
+ 	case HW_EVENT_ERR_UNCORRECTED:
+ 		ev.error_type = "Uncorrected";
++		ev.severity = GHES_SEV_RECOVERABLE;
+ 		break;
+ 	case HW_EVENT_ERR_DEFERRED:
+ 		ev.error_type = "Deferred";
+ 		break;
+ 	case HW_EVENT_ERR_FATAL:
+ 		ev.error_type = "Fatal";
++		ev.severity = GHES_SEV_PANIC;
+ 		break;
+ 	case HW_EVENT_ERR_INFO:
+ 	default:
+ 		ev.error_type = "Info";
++		ev.severity = GHES_SEV_NO;
+ 	}
+ 
+ 	switch (val) {
+@@ -249,6 +254,10 @@ int ras_mc_event_handler(struct trace_seq *s,
+ 
+ 	run_mc_event_trigger(&ev);
+ 
++#ifdef HAVE_JSON_REPORT
++	report_mc_event_json(s, &ev);
++#endif
++
+ 	return 0;
+ 
+ parse_error:
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index c272bb0..b61976a 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -18,6 +18,7 @@
+ #include "ras-report.h"
+ #include "types.h"
+ #include "trigger.h"
++#include "ras-events.h"
+ 
+ /*
+  * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code,
+@@ -507,7 +508,7 @@ int ras_mce_event_handler(struct trace_seq *s,
+ 	unsigned long long val;
+ 	struct ras_events *ras = context;
+ 	struct mce_priv *mce = ras->mce_priv;
+-	struct mce_event e;
++	struct mce_event e = { 0 };
+ 	int rc = 0;
+ 
+ 	memset(&e, 0, sizeof(e));
+@@ -608,6 +609,10 @@ int ras_mce_event_handler(struct trace_seq *s,
+ 	ras_report_mce_event(ras, &e);
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++	report_mce_event_json(s, &e);
++#endif
++
+ 	run_mce_record_trigger(&e);
+ 
+ 	return 0;
+diff --git a/ras-mce-handler.h b/ras-mce-handler.h
+index f120874..d2031cf 100644
+--- a/ras-mce-handler.h
++++ b/ras-mce-handler.h
+@@ -68,6 +68,7 @@ struct mce_event {
+ 	int32_t		vdata_len;
+ 	const uint64_t	*vdata;
+ 
++	int		severity;
+ 	/* Parsed data */
+ 	char		frutext[17];
+ 	char		timestamp[64];
+diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
+index 43e7c5d..df90244 100644
+--- a/ras-memory-failure-handler.c
++++ b/ras-memory-failure-handler.c
+@@ -117,7 +117,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_mf_event ev;
++	struct ras_mf_event ev = { 0 };
+ 
+ 	trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]);
+ 	/*
+@@ -172,5 +172,9 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
+ #endif
+ 	run_mf_event_trigger(&ev);
+ 
++#ifdef HAVE_JSON_REPORT
++	report_mf_event_json(s, &ev);
++#endif
++
+ 	return 0;
+ }
+diff --git a/ras-record.h b/ras-record.h
+index ce7d12c..7f49b74 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -16,6 +16,13 @@
+ #include "config.h"
+ #include "types.h"
+ 
++static const char * const severity_strs[] = {
++	"info",
++	"corrected",
++	"recoverable",
++	"fatal",
++};
++
+ extern long user_hz;
+ 
+ struct ras_events;
+@@ -23,6 +30,7 @@ struct ras_events;
+ struct ras_mc_event {
+ 	char timestamp[64];
+ 	int error_count;
++	int severity;
+ 	const char *error_type, *msg, *label;
+ 	unsigned char mc_index;
+ 	signed char top_layer, middle_layer, lower_layer;
+@@ -44,6 +52,7 @@ struct ras_aer_event {
+ 	char timestamp[64];
+ 	const char *error_type;
+ 	char *dev_name;
++	int severity;
+ 	uint8_t tlp_header_valid;
+ 	uint32_t *tlp_header;
+ 	const char *msg;
+diff --git a/ras-report-json.c b/ras-report-json.c
+new file mode 100644
+index 0000000..b1c33a4
+--- /dev/null
++++ b/ras-report-json.c
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 and
++ * only version 2 as published by the Free Software Foundation.
++
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ */
++
++#include <stdio.h>
++#include <string.h>
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/utsname.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <pci/pci.h>
++
++#include "traceevent/event-parse.h"
++#include "ras-report.h"
++
++#define NONE ""
++int json_report = 1;
++
++void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++	/* Append ev to s as one JSON object; every conversion below needs its own argument. */
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"mc_event\", "
++			 "\"timestamp\": \"%s\", "
++			 "\"severity\": \"%s\", "
++			 "\"error_count\": %d, "
++			 "\"error_type\": \"%s\", "
++			 "\"msg\": \"%s\", "
++			 "\"label\": \"%s\", "
++			 "\"location\": \"%d:%d:%d:%d\", "
++			 "\"address\": \"%#llx\", "
++			 "\"grain\": \"%#llx\", "
++			 "\"syndrome\": \"%#llx\", "
++			 "\"driver_detail\": \"%s\" }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 severity_strs[ev->severity],
++			 ev->error_count,
++			 (ev->error_type) ? ev->error_type : NONE,
++			 (ev->msg) ? ev->msg : NONE,
++			 (ev->label) ? ev->label : NONE,
++			 ev->mc_index, ev->top_layer, ev->middle_layer, ev->lower_layer,
++			 ev->address,
++			 ev->grain,
++			 ev->syndrome,
++			 (ev->driver_detail) ? ev->driver_detail : NONE);
++}
++
++static void get_pci_dev_name(const char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id)
++{
++	struct pci_access *pacc;
++	struct pci_dev *dev;
++	struct pci_filter filter = {0};
++	int domain, bus, device, function;
++
++	pacc = pci_alloc();
++	if (!pacc)
++		return;
++	pci_init(pacc);
++	pci_scan_bus(pacc);
++
++	if (!bdf || !pci_name)
++		goto free;
++	/* Accept DDDD:BB:DD.F, DDDD:BB:DD (function 0) or BB:DD.F (domain 0). */
++	function = 0;
++	if (sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &device, &function) < 3) {
++		domain = 0;
++		if (sscanf(bdf, "%x:%x.%x", &bus, &device, &function) != 3)
++			goto free;
++	}
++
++	pci_filter_init(pacc, &filter);
++	filter.bus = bus;
++	filter.slot = device;
++	filter.func = function;
++	filter.domain = domain;
++	/* Report the first scanned device that matches the parsed address. */
++	for (dev = pacc->devices; dev; dev = dev->next) {
++		if (pci_filter_match(&filter, dev)) {
++			pci_fill_info(dev, PCI_FILL_IDENT);
++			*vendor_id = dev->vendor_id;
++			*device_id = dev->device_id;
++			pci_lookup_name(pacc, pci_name, len,
++					PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE,
++					dev->vendor_id, dev->device_id);
++			break;
++		}
++	}
++
++free:
++	pci_cleanup(pacc);
++}
++
++void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev)
++{
++	char pci_name[128] = "";
++	u16 vendor = 0, device = 0;
++
++	if (!s || !ev || !json_report)
++		return;
++	/* pci_name stays "" (reported as NONE) when the device cannot be resolved. */
++	if (ev->dev_name)
++		get_pci_dev_name(ev->dev_name, pci_name, sizeof(pci_name), &vendor, &device);
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"aer_event\", "	\
++			 "\"timestamp\": \"%s\", "	\
++			 "\"severity\": \"%s\", "	\
++			 "\"error_type\": \"%s\", "	\
++			 "\"dev_name\": \"%s\", "	\
++			 "\"pci_dev_name\": \"%s\", "	\
++			 "\"vendor_id\": \"%#x\", "	\
++			 "\"device_id\": \"%#x\", "	\
++			 "\"msg\": \"%s\" }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 severity_strs[ev->severity],
++			 (ev->error_type) ? ev->error_type : NONE,
++			 (ev->dev_name) ? ev->dev_name : NONE,
++			 (*pci_name) ? pci_name : NONE,
++			 vendor, device,
++			 (ev->msg) ? ev->msg : NONE);
++}
++
++void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"arm_event\", "	\
++			 "\"timestamp\": \"%s\", "	\
++			 "\"error_count\": %d, "	\
++			 "\"affinity\": %d, "		\
++			 "\"mpidr\": \"%#lx\", "	\
++			 "\"midr\": \"%#lx\", "		\
++			 "\"running_state\": %d, "	\
++			 "\"psci_state\": %d }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 ev->error_count,
++			 ev->affinity,
++			 ev->mpidr,
++			 ev->midr,
++			 ev->running_state,
++			 ev->psci_state);
++}
++
++void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,	/* pfn is a string (may be empty), so it is quoted */
++			 "\n{ \"%s\": \"mf_event\", \"timestamp\": \"%s\", "
++			 "\"pfn\": \"%s\", \"page_type\": \"%s\", "
++			 "\"action_result\": \"%s\" }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 (*ev->pfn) ? ev->pfn : NONE,
++			 (ev->page_type) ? ev->page_type : NONE,
++			 (ev->action_result) ? ev->action_result : NONE);
++}
++
++void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++	/* Derive the reported severity from the MCi_STATUS bits. */
++	if (ev->status & MCI_STATUS_UC)
++		ev->severity = GHES_SEV_RECOVERABLE;
++	else if (ev->status & MCI_STATUS_DEFERRED)
++		ev->severity = GHES_SEV_RECOVERABLE;
++	else
++		ev->severity = GHES_SEV_CORRECTED;
++
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"mce_event\", "
++			 "\"timestamp\": \"%s\", "
++			 "\"severity\": \"%s\", "
++			 "\"bank\": %d, "
++			 "\"bank_name\": \"%s\", "
++			 "\"status\": \"%#lx\", "
++			 "\"error_msg\": \"%s\", "
++			 "\"mcistatus_msg\": \"%s\", "
++			 "\"mcastatus_msg\": \"%s\", "
++			 "\"user_action\": \"%s\", "
++			 "\"mc_location\": \"%s\", "
++			 "\"cpuid\": \"%#x\", "
++			 "\"cpu\": %d, "
++			 "\"socketid\": %d, "
++			 "\"ip\": \"%#lx\", "
++			 "\"cs\": \"%#x\", "
++			 "\"misc\": \"%#lx\", "
++			 "\"addr\": \"%#lx\", "
++			 "\"synd\": \"%#lx\", "
++			 "\"ipid\": \"%#lx\", "
++			 "\"mcgstatus_msg\": \"%s\", "
++			 "\"mcgstatus\": \"%#lx\", "
++			 "\"mcgcap\": \"%#lx\", "
++			 "\"apicid\": \"%#x\" }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 severity_strs[ev->severity],
++			 ev->bank,
++			 (*ev->bank_name) ? ev->bank_name : NONE,
++			 ev->status,
++			 (*ev->error_msg) ? ev->error_msg : NONE,
++			 (*ev->mcistatus_msg) ? ev->mcistatus_msg : NONE,
++			 (*ev->mcastatus_msg) ? ev->mcastatus_msg : NONE,
++			 (*ev->user_action) ? ev->user_action : NONE,
++			 (*ev->mc_location) ? ev->mc_location : NONE,
++			 ev->cpuid,
++			 ev->cpu,
++			 ev->socketid,
++			 ev->ip,
++			 ev->cs,
++			 ev->misc,
++			 ev->addr,
++			 ev->synd,
++			 ev->ipid,
++			 (*ev->mcgstatus_msg) ? ev->mcgstatus_msg : NONE,
++			 ev->mcgstatus,
++			 ev->mcgcap,
++			 ev->apicid);
++}
++
+diff --git a/ras-report.h b/ras-report.h
+index f680a25..eeb25bb 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -23,6 +23,12 @@
+ /* ABRT socket file */
+ #define ABRT_SOCKET "/var/run/abrt/abrt.socket"
+ 
++#ifdef HAVE_JSON_REPORT
++#define JSON_REPORT_KEY "rasdaemon_event_name"
++
++extern int json_report;
++#endif
++
+ #ifdef HAVE_ABRT_REPORT
+ 
+ int ras_report_mc_event(struct ras_events *ras,
+@@ -115,4 +121,12 @@ static inline int ras_report_signal_event(struct ras_events *ras,
+ { return 0; };
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev);
++void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev);
++void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
++void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
++void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
++#endif
++
+ #endif
+diff --git a/ras-signal-handler.c b/ras-signal-handler.c
+index e8f7f1d..d15c4f6 100644
+--- a/ras-signal-handler.c
++++ b/ras-signal-handler.c
+@@ -78,7 +78,7 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record,
+ 	struct ras_events *ras = context;
+ 	time_t now;
+ 	struct tm *tm;
+-	struct ras_signal_event ev;
++	struct ras_signal_event ev = { 0 };
+ 
+ 	/*
+ 	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+diff --git a/rasdaemon.c b/rasdaemon.c
+index 9c5f9dd..d5d2f85 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -16,6 +16,7 @@
+ #include "ras-logger.h"
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
++#include "ras-report.h"
+ #include "ras-mc-handler.h"
+ #include "ras-pcie-edpc.h"
+ #include "ras-nvgpu.h"
+@@ -146,6 +147,13 @@ int main(int argc, char *argv[])
+ 		log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold);
+ #endif
+ 
++#ifdef HAVE_JSON_REPORT
++	if (choices_disable &&
++	    strlen(choices_disable) != 0 &&
++	    strstr(choices_disable, "json_report"))
++		json_report = 0;
++#endif
++
+ #ifdef HAVE_MCE
+ 	const struct argp_option offline_options[] = {
+ 		{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+-- 
+2.43.5
+
diff --git a/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch
new file mode 100644
index 0000000000000000000000000000000000000000..19a3fd5ceedb8a3c7913d248426f048a69ed5370
--- /dev/null
+++ b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch
@@ -0,0 +1,998 @@
+From 340a8af496dd80a719e27e6395f96c8d75cf6f36 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Wed, 11 Dec 2024 16:16:30 +0800
+Subject: [PATCH 18/30] anolis: rasdaemon: kmsg_monitor: introduce kmsg_monitor
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |   6 +-
+ configure.ac       |  11 +++
+ misc/rasdaemon.env |  43 +++++++++-
+ ras-events.c       | 114 +++++++++++++++++++++++--
+ ras-kmsg.c         | 203 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-kmsg.h         |  47 +++++++++++
+ ras-report-json.c  |  68 ++++++++++++++-
+ ras-report.h       |   2 +
+ ras-time.c         | 103 +++++++++++++++++++++++
+ ras-time.h         |  27 ++++++
+ rasdaemon.c        |  14 ++++
+ trigger.c          |  55 ++++++++++++
+ trigger.h          |   3 +
+ 13 files changed, 685 insertions(+), 11 deletions(-)
+ create mode 100644 ras-kmsg.c
+ create mode 100644 ras-kmsg.h
+ create mode 100644 ras-time.c
+ create mode 100644 ras-time.h
+
+diff --git a/Makefile.am b/Makefile.am
+index 1f21137..68b354b 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -134,6 +134,9 @@ endif
+ if WITH_SIGNAL
+    rasdaemon_SOURCES += ras-signal-handler.c
+ endif
++if WITH_KMSG_MONITOR
++   rasdaemon_SOURCES += ras-kmsg.c ras-time.c
++endif
+ 
+ if WITH_POISON_PAGE_STAT
+    rasdaemon_SOURCES += ras-poison-page-stat.c
+@@ -159,7 +162,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
+ 		  ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \
+ 		  non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \
+-		  ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h
++		  ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h \
++		  ras-kmsg.h ras-time.h
+ 
+ # This rule can't be called with more than one Makefile job (like make -j8)
+ # I can't figure out a way to fix that
+diff --git a/configure.ac b/configure.ac
+index c5164ec..dfb7f02 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -303,6 +303,16 @@ AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [
+ AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"])
+ 
++AC_ARG_ENABLE([kmsg_monitor],
++    AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)]))
++
++AS_IF([test "x$enable_kmsg_monitor" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_DEFINE(HAVE_KMSG_MONITOR,1,"have kmsg monitor")
++  AC_SUBST([WITH_KMSG_MONITOR])
++])
++AM_CONDITIONAL([WITH_KMSG_MONITOR], [test x$enable_kmsg_monitor = xyes || test x$enable_all == xyes])
++AM_COND_IF([WITH_KMSG_MONITOR], [USE_KMSG_MONITOR="yes"], [USE_KMSG_MONITOR="no"])
++
+ test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
+ 
+ CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
+@@ -353,4 +363,5 @@ compile time options summary
+     ERST                : $USE_ERST
+     NVGPU RAS errors    : $USE_NVGPU
+     Json exporter       : $USE_JSON_REPORT
++    Kmsg monitor        : $USE_KMSG_MONITOR
+ EOF
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 085d839..f498e24 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h"
+ # Prevent excessive isolation from causing an avalanche effect
+ CPU_ISOLATION_LIMIT="10"
+ 
+-DISABLE="json_report"
++DISABLE="json_report,kmsg_monitor"
+ 
+ # Event Trigger
+ 
+@@ -115,6 +115,10 @@ POST_PAGE_OFFLINE_TRIGGER=
+ PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
+ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
+ 
++#trigger for kmsg
++KMSG_TRIGGER=
++KMSG_TRIGGER_TIMEOUT=0
++
+ # CE Statistic Threshold
+ #
+ # Specify the threshold of CE per second.
+@@ -145,3 +149,40 @@ EDPC_DEVICE=
+ # For example:
+ #   NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock
+ NVGPU_DISABLE_EVENT="0x10"
++
++
++# KMSG MONITOR
++KMSG_IGNORE_XID=""
++KMSG_LIMIT=100
++KMSG_TRACE_NUM=6
++KMSG_TRACE_END=1
++
++KMSG_TRACER_NAME_0="xid"
++KMSG_TRACER_REGEX_0="NVRM: Xid \\(PCI:(.*)( GPU-I:[0-9]+)?( GPU-CI:[0-9]+)?\\): ([0-9]+), pid=([^,]*)(, name=([^,]*))?, (.*)"
++KMSG_TRACER_GROUP_COUNT_0=8
++KMSG_TRACER_GROUP_KEY_0="pci_port,gpu-i,gpu-ci,xid,pid,has_name,name,msg"
++
++KMSG_TRACER_NAME_1="sxid"
++KMSG_TRACER_REGEX_1="nvidia-nvswitch[0-9]+: SXid \\(PCI:(.*)\\): ([0-9]+), (.*)"
++KMSG_TRACER_GROUP_COUNT_1=3
++KMSG_TRACER_GROUP_KEY_1="pci_port,xid,msg"
++
++KMSG_TRACER_NAME_2="axid"
++KMSG_TRACER_REGEX_2="PPU.* Xid \\((.*)\\): ([0-9]+)(, pid=([^,]*))?, (.*)"
++KMSG_TRACER_GROUP_COUNT_2=5
++KMSG_TRACER_GROUP_KEY_2="pci_port,xid,has_pid,pid,msg"
++
++KMSG_TRACER_NAME_3="aer_recovery"
++KMSG_TRACER_REGEX_3="pcieport (.*): AER: device recovery (successful|failed)"
++KMSG_TRACER_GROUP_COUNT_3=2
++KMSG_TRACER_GROUP_KEY_3="pci_port,res"
++
++KMSG_TRACER_NAME_4="pcihp"
++KMSG_TRACER_REGEX_4="pcieport (.*): pciehp: Slot\\(([0-9]+)\\): (Link Up|Link Down|Card present|Card not present|Link Down/Up ignored \\(recovered by DPC\\))"
++KMSG_TRACER_GROUP_COUNT_4=3
++KMSG_TRACER_GROUP_KEY_4="pci_port,slot,res"
++
++KMSG_TRACER_NAME_5="cmci_storm"
++KMSG_TRACER_REGEX_5="CMCI storm (.*): switching to .* mode"
++KMSG_TRACER_GROUP_COUNT_5=1
++KMSG_TRACER_GROUP_KEY_5="storm"
+diff --git a/ras-events.c b/ras-events.c
+index 06f9a37..d40f29e 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -14,6 +14,8 @@
+ #include <string.h>
+ #include <sys/poll.h>
+ #include <sys/signalfd.h>
++#include <sys/time.h>
++#include <time.h>
+ #include <sys/stat.h>
+ #include <sys/types.h>
+ #include <traceevent/event-parse.h>
+@@ -37,6 +39,25 @@
+ #include "ras-signal-handler.h"
+ #include "ras-record.h"
+ #include "trigger.h"
++#include "ras-kmsg.h"
++
++#ifdef HAVE_KMSG_MONITOR
++#define NS_PER_SEC 1000000000L
++
++/* Return a - b, borrowing one second when the nanosecond part underflows. */
++static struct timespec ts_sub(struct timespec a, struct timespec b)
++{
++	struct timespec diff;
++
++	diff.tv_sec = a.tv_sec - b.tv_sec;
++	diff.tv_nsec = a.tv_nsec - b.tv_nsec;
++	if (diff.tv_nsec < 0) {
++		diff.tv_sec--;
++		diff.tv_nsec += NS_PER_SEC;
++	}
++	return diff;
++}
++#endif
+ 
+ /*
+  * Polling time, if read() doesn't block. Currently, trace_pipe_raw never
+@@ -464,12 +485,22 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 	int ready, i, count_nready;
+ 	struct kbuffer *kbuf;
+ 	void *page;
+-	struct pollfd fds[n_cpus + 1];
+ 	struct signalfd_siginfo fdsiginfo;
+ 	sigset_t mask;
+ 	int warnonce[n_cpus];
+ 	char pipe_raw[PATH_MAX];
+ 	int legacy_kernel = 0;
++#ifdef HAVE_KMSG_MONITOR
++	int fd_num = n_cpus + 2;
++	char kmsg_buf[PRINTK_MESSAGE_MAX];
++	int limit = 0;
++	struct timespec limit_time = { 0 };
++	int need_sleep = 0;
++#else
++	int fd_num = n_cpus + 1;
++#endif
++	struct pollfd fds[fd_num];
++
+ 
+ 	memset(&warnonce, 0, sizeof(warnonce));
+ 
+@@ -496,7 +527,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 	if (set_buffer_percent(pdata[0].ras, 0))
+ 		log(TERM, LOG_WARNING, "Set buffer_percent failed\n");
+ 
+-	for (i = 0; i < (n_cpus + 1); i++)
++	for (i = 0; i < fd_num; i++)
+ 		fds[i].fd = -1;
+ 
+ 	for (i = 0; i < n_cpus; i++) {
+@@ -527,6 +558,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 		goto error;
+ 	}
+ 
++#ifdef HAVE_KMSG_MONITOR
++	if (kmsg_monitor) {
++		fds[n_cpus + 1].events = POLLIN;
++		fds[n_cpus + 1].fd = open("/dev/kmsg", O_RDONLY);
++		if (fds[n_cpus + 1].fd < 0) {
++			log(TERM, LOG_ERR, "open /dev/kmsg\n");
++			goto error;
++		}
++
++		if (kmsg_trace_end) {
++			off_t offset = lseek(fds[n_cpus + 1].fd, 0, SEEK_END);
++
++			if (offset == -1) {
++				log(TERM, LOG_ERR, "Can not seek kmsg end\n");
++				goto error;
++			}
++		}
++	}
++#endif
++
+ 	log(TERM, LOG_INFO, "Listening to events for cpus 0 to %d\n", n_cpus - 1);
+ 	if (pdata[0].ras->record_events) {
+ 		if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras))
+@@ -538,7 +589,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 	}
+ 
+ 	do {
+-		ready = poll(fds, (n_cpus + 1), -1);
++		ready = poll(fds, fd_num, -1);
+ 		if (ready < 0)
+ 			log(TERM, LOG_WARNING, "poll\n");
+ 
+@@ -564,6 +615,40 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 		}
+ 
+ 		count_nready = 0;
++#ifdef HAVE_KMSG_MONITOR
++		/* read one chunk from kmsg; one byte stays free for the '\0' added below */
++		if (kmsg_monitor && (fds[n_cpus + 1].revents & POLLIN)) {
++			size = read(fds[n_cpus + 1].fd, kmsg_buf, sizeof(kmsg_buf) - 1);
++			if (size < 0) {
++				log(TERM, LOG_WARNING, "read kmsg %s\n", strerror(errno));
++			} else if (size > 0) {
++				kmsg_buf[size] = '\0';
++				kmsg_match(kmsg_buf);
++				memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX);
++			} else {
++				count_nready++;
++			}
++			limit++;
++			if (kmsg_limit && limit >= kmsg_limit) {	/* every kmsg_limit reads */
++				struct timespec tv, res;
++
++				clock_gettime(CLOCK_MONOTONIC, &tv);
++
++				res = ts_sub(tv, limit_time);	/* time the last batch of reads took */
++				if (res.tv_sec == 0 && res.tv_nsec >= 0 && res.tv_nsec < (0.5 * NS_PER_SEC)) {
++					need_sleep = 1;
++					log(TERM, LOG_WARNING, "kmsg limit %lx!\n", res.tv_nsec);
++				}
++
++				limit = 0;
++				limit_time = tv;
++			}
++
++		} else {
++			count_nready++;
++		}
++#endif
++
+ 		for (i = 0; i < n_cpus; i++) {
+ 			if (fds[i].revents & POLLERR) {
+ 				if (!warnonce[i]) {
+@@ -599,11 +684,18 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 				count_nready++;
+ 			}
+ 		}
++#ifdef HAVE_KMSG_MONITOR
++		if (need_sleep) {
++			usleep(500000);
++			need_sleep = 0;
++		}
++#endif
++
+ 		/*
+ 		 * If we enable fallback mode, it will always be used, as
+ 		 * poll is still not working fine, IMHO
+ 		 */
+-		if (count_nready == n_cpus) {
++		if (count_nready == fd_num - 1) {	/* fds[n_cpus] is never counted */
+ 			/* Should only happen with legacy kernels */
+ 			legacy_kernel = 1;
+ 			break;
+@@ -627,7 +719,7 @@ error:
+ 	free(page);
+ 	sigprocmask(SIG_UNBLOCK, &mask, NULL);
+ 
+-	for (i = 0; i < (n_cpus + 1); i++) {
++	for (i = 0; i < fd_num; i++) {
+ 		if (fds[i].fd > 0)
+ 			close(fds[i].fd);
+ 	}
+@@ -991,6 +1083,13 @@ int handle_ras_events(int record_events, int enable_ipmitool)
+ 	ras_page_account_init();
+ #endif
+ 
++#ifdef HAVE_KMSG_MONITOR
++	if (kmsg_monitor) {
++		if (kmsg_tracer_init())
++			goto err;
++	}
++#endif
++
+ 	rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
+ 			       ras_mc_event_handler, NULL, MC_EVENT);
+ 	if (!rc)
+@@ -1269,5 +1368,10 @@ err:
+ #ifdef HAVE_MEMORY_ROW_CE_PFA
+ 	row_record_infos_free();
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	if (kmsg_monitor)
++		kmsg_tracer_destroy();
++#endif
++
+ 	return rc;
+ }
+diff --git a/ras-kmsg.c b/ras-kmsg.c
+new file mode 100644
+index 0000000..2dd47d6
+--- /dev/null
++++ b/ras-kmsg.c
+@@ -0,0 +1,203 @@
++#define _GNU_SOURCE
++#include <regex.h>
++#include <string.h>
++#include <stdlib.h>
++
++#include "ras-logger.h"
++#include "ras-report.h"
++#include "ras-kmsg.h"
++#include "trigger.h"
++
++int kmsg_monitor = 1;
++int kmsg_trace_end;
++int kmsg_limit;
++
++struct kmsg_tracer_info *kmsg_tracer;
++int kmsg_tracer_num;
++
++/* Run @msg through each tracer regex; report and trigger on the first match. */
++int kmsg_match(char *msg)
++{
++	int ret, i;
++	struct kmsg_tracer_info *tracer;
++	char tmpbuf[256];
++
++	for (i = 0; i < kmsg_tracer_num; i++) {
++		tracer = &kmsg_tracer[i];
++
++		ret = regexec(&tracer->regex_c, msg, tracer->group_count,
++			      tracer->matches, 0);
++		if (ret > REG_NOMATCH) {
++			regerror(ret, &tracer->regex_c, tmpbuf, sizeof(tmpbuf));
++			log(ALL, LOG_ERR, "Regex execution error: %s\n", tmpbuf);
++			return 1;
++		} else if (ret == REG_NOMATCH) {
++			continue;
++		}
++
++#ifdef HAVE_JSON_REPORT
++		report_kmsg_event_json(tracer, msg);
++#endif
++		/* hand over the tracer that matched, not the start of the array */
++		run_kmsg_trigger(tracer, msg);
++
++		break;
++	}
++
++	return 0;
++}
++
++/* Free all tracer state; free(NULL) is a no-op, so fields need no NULL checks. */
++int kmsg_tracer_destroy(void)
++{
++	log(ALL, LOG_INFO, "kmsg tracer destroy\n");
++
++	if (!kmsg_tracer)
++		return 0;
++	for (int i = 0; i < kmsg_tracer_num; i++) {
++		free(kmsg_tracer[i].name);
++		free(kmsg_tracer[i].regex);
++		/* matches is only allocated after regcomp() succeeded */
++		if (kmsg_tracer[i].matches)
++			regfree(&kmsg_tracer[i].regex_c);
++		free(kmsg_tracer[i].matches);
++		if (!kmsg_tracer[i].group_key)
++			continue;
++		for (int j = 0; j < kmsg_tracer[i].group_count; j++)
++			free(kmsg_tracer[i].group_key[j]);
++		free(kmsg_tracer[i].group_key);
++	}
++	free(kmsg_tracer);
++	/* leave no dangling pointer behind for kmsg_match() or a second call */
++	kmsg_tracer = NULL;
++	kmsg_tracer_num = 0;
++
++	return 0;
++}
++
++/*
++ * Build the tracer table from the KMSG_* environment variables. Returns 0 on
++ * success or if no tracer is configured; on error the partial table is kept.
++ */
++int kmsg_tracer_init(void)
++{
++	int kmsg_tracer_group_count, ret, c = 0;
++	char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp;
++	char *kmsg_tracer_group_key, *token, *s;
++
++	s = getenv(KMSG_TRACE_END);
++	if (!s)
++		kmsg_trace_end = 0;
++	else
++		kmsg_trace_end = atoi(s);
++
++	s = getenv(KMSG_TRACE_NUM);
++	if (!s)
++		return 0;
++
++	kmsg_tracer_num = atoi(s);
++	if (kmsg_tracer_num <= 0)
++		return 0;
++
++	s = getenv(KMSG_LIMIT);
++	if (s) {
++		kmsg_limit = atoi(s);
++		if (kmsg_limit < 0)
++			return -1;
++	}
++
++	kmsg_tracer = calloc(kmsg_tracer_num, sizeof(struct kmsg_tracer_info));
++	if (!kmsg_tracer)
++		return -1;
++
++	for (int i = 0; i < kmsg_tracer_num; i++) {
++		// tracer name
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_NAME, i);
++		kmsg_tracer_name = getenv(buf);
++		if (!kmsg_tracer_name || ((strlen(kmsg_tracer_name) > NAME_LEN)))
++			return -1;
++		kmsg_tracer[i].name = strdup(kmsg_tracer_name);
++		if (!kmsg_tracer[i].name)
++			return -1;
++
++		// tracer regex, stored with a trailing '\n' appended
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_REGEX, i);
++		kmsg_tracer_regex = getenv(buf);
++		if (!kmsg_tracer_regex || (strlen(kmsg_tracer_regex) > BUF_LEN))
++			return -1;
++		snprintf(buf, sizeof(buf), "%s\n", kmsg_tracer_regex);
++		kmsg_tracer[i].regex = strdup(buf);
++		if (!kmsg_tracer[i].regex)
++			return -1;
++
++		// tracer group count
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_GROUP_COUNT, i);
++		tmp = getenv(buf);
++		if (!tmp)
++			return -1;
++		kmsg_tracer_group_count = atoi(tmp);
++		if (kmsg_tracer_group_count < 0)
++			return -1;
++		kmsg_tracer_group_count++;	/* regexec() slot 0 holds the whole match */
++		kmsg_tracer[i].group_count = kmsg_tracer_group_count;
++		kmsg_tracer[i].group_key = calloc(kmsg_tracer_group_count, sizeof(char *));
++		if (!kmsg_tracer[i].group_key)
++			return -1;
++
++		// tracer group key
++		snprintf(buf, sizeof(buf), "%s_%d", KMSG_TRACER_GROUP_KEY, i);
++		tmp = getenv(buf);	/* may be unset: never hand NULL to strdup() */
++		if (!tmp || strlen(tmp) > BUF_LEN)
++			return -1;
++		kmsg_tracer_group_key = strdup(tmp);
++		if (!kmsg_tracer_group_key)
++			return -1;
++
++		c = 0;
++		token = strtok(kmsg_tracer_group_key, ",");
++		while (token) {
++			kmsg_tracer[i].group_key[c++] = strdup(token);
++			if (c >= kmsg_tracer_group_count)
++				break;
++			token = strtok(NULL, ",");
++		}
++		free(kmsg_tracer_group_key);
++		/* one key per capture group is dereferenced when a line matches */
++		if (c < kmsg_tracer_group_count - 1)
++			return -1;
++
++		ret = regcomp(&kmsg_tracer[i].regex_c, kmsg_tracer[i].regex, REG_EXTENDED);
++		if (ret) {
++			regerror(ret, &kmsg_tracer[i].regex_c, buf, sizeof(buf));
++			log(ALL, LOG_ERR, "Regex execution error: %s\n", buf);
++			return ret;
++		}
++
++		kmsg_tracer[i].matches = calloc(kmsg_tracer_group_count, sizeof(regmatch_t));
++		if (!kmsg_tracer[i].matches)
++			return -1;
++
++		if (!strcmp("xid", kmsg_tracer[i].name) ||
++		    !strcmp("sxid", kmsg_tracer[i].name) ||
++		    !strcmp("axid", kmsg_tracer[i].name)) {
++			s = getenv(KMSG_IGNORE_XID);
++			tmp = s ? strdup(s) : NULL;
++			if (!tmp)
++				continue;
++
++			c = 0;
++			token = strtok(tmp, ",");
++			/* ignore_xid[] has room for 30 ids; extra entries are dropped */
++			while (token && c < 30) {
++				kmsg_tracer[i].info.xid.ignore_xid[c++] = atoi(token);
++				token = strtok(NULL, ",");
++			}
++			kmsg_tracer[i].info.xid.len = c;
++			free(tmp);
++		}
++	}
++
++	setup_event_trigger("kmsg_monitor");
++
++	return 0;
++}
+diff --git a/ras-kmsg.h b/ras-kmsg.h
+new file mode 100644
+index 0000000..f31125f
+--- /dev/null
++++ b/ras-kmsg.h
+@@ -0,0 +1,47 @@
++
++#ifndef __RAS_KMSG_H
++#define __RAS_KMSG_H
++
++#include <regex.h>
++
++/**
++ * Kernel message tracer related definitions
++ */
++#define KMSG_TRACE_NUM	"KMSG_TRACE_NUM"
++#define KMSG_TRACER_NAME "KMSG_TRACER_NAME"
++#define KMSG_TRACER_REGEX "KMSG_TRACER_REGEX"
++#define KMSG_TRACER_GROUP_COUNT "KMSG_TRACER_GROUP_COUNT"
++#define KMSG_TRACER_GROUP_KEY "KMSG_TRACER_GROUP_KEY"
++
++#define KMSG_TRACE_END	"KMSG_TRACE_END"
++#define KMSG_IGNORE_XID	"KMSG_IGNORE_XID"
++#define KMSG_LIMIT	"KMSG_LIMIT"
++
++#define NAME_LEN 64
++#define BUF_LEN 1024
++#define PRINTK_MESSAGE_MAX 2048
++
++extern int kmsg_monitor;
++extern int kmsg_trace_end;
++extern int kmsg_limit;
++
++struct kmsg_tracer_info {
++	char *name;		/* tracer name, from KMSG_TRACER_NAME_<n> */
++	char *regex;		/* regex source text; kmsg_tracer_init() appends '\n' */
++	int group_count;	/* KMSG_TRACER_GROUP_COUNT_<n> + 1 */
++	char **group_key;	/* keys split from KMSG_TRACER_GROUP_KEY_<n> */
++	regex_t regex_c;	/* @regex compiled with REG_EXTENDED */
++	regmatch_t *matches;	/* group_count slots passed to regexec() */
++	union {
++		struct {
++			int len;		/* used entries in ignore_xid[] */
++			int ignore_xid[30];	/* ids parsed from KMSG_IGNORE_XID */
++		} xid;
++	} info;
++};
++
++int kmsg_tracer_init(void);
++int kmsg_tracer_destroy(void);
++int kmsg_match(char *msg);
++
++#endif
+diff --git a/ras-report-json.c b/ras-report-json.c
+index b1c33a4..2d35355 100644
+--- a/ras-report-json.c
++++ b/ras-report-json.c
+@@ -11,17 +11,17 @@
+  * GNU General Public License for more details.
+  */
+ 
++#include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
++#include <time.h>
+ #include <unistd.h>
+-#include <sys/types.h>
+-#include <sys/utsname.h>
+-#include <sys/socket.h>
+-#include <sys/un.h>
+ #include <pci/pci.h>
+ 
+ #include "traceevent/event-parse.h"
++#include "ras-kmsg.h"
+ #include "ras-report.h"
++#include "ras-time.h"
+ 
+ #define NONE ""
+ int json_report = 1;
+@@ -236,3 +236,63 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
+ 			 ev->apicid);
+ }
+ 
++#ifdef HAVE_KMSG_MONITOR
++/* Print one JSON object for a kmsg line that matched @kmsg_tracer's regex. */
++void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg)
++{
++	struct trace_seq seq;
++	int len;
++	int group_count = kmsg_tracer->group_count;
++	regmatch_t *matches = kmsg_tracer->matches;
++	char tmpbuf[256] = {0}, timestamp[64] = {0};
++	char pci_name[128] = {0}, *key;
++	const char *val;
++	u16 vendor = 0, device = 0;
++
++	get_kmsg_time(msg, timestamp);
++
++	trace_seq_init(&seq);
++	trace_seq_printf(&seq, "\n{ \"%s\": \"%s\", ", JSON_REPORT_KEY, kmsg_tracer->name);
++	/* every later field brings its own leading ", " so no comma can dangle */
++	trace_seq_printf(&seq, "\"timestamp\": \"%s\"", timestamp);
++
++	for (int j = 1; j < group_count; j++) {
++		key = kmsg_tracer->group_key[j - 1];
++		if (!key)	/* fewer keys configured than capture groups */
++			break;
++
++		/* a group that did not take part in the match has rm_so == -1 */
++		len = matches[j].rm_so < 0 ? 0 : (int)(matches[j].rm_eo - matches[j].rm_so);
++		val = len ? msg + matches[j].rm_so : "";
++		trace_seq_printf(&seq, ", \"%s\": \"%.*s\"", key, len, val);
++
++		if (!strcmp("pci_port", key)) {
++			snprintf(tmpbuf, 128, "%.*s", len, val);
++			get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device);
++			trace_seq_printf(&seq, ", \"pci_dev_name\": \"%s\"", pci_name);
++			trace_seq_printf(&seq, ", \"vendor_id\": \"%#x\"", vendor);
++			trace_seq_printf(&seq, ", \"device_id\": \"%#x\"", device);
++		}
++
++		if (!strcmp("xid", key) ||
++		    !strcmp("sxid", key) ||
++		    !strcmp("axid", key)) {
++			int xid;
++
++			snprintf(tmpbuf, 128, "%.*s", len, val);
++			xid = (int)strtol(tmpbuf, NULL, 10);
++			for (int i = 0; i < kmsg_tracer->info.xid.len; i++) {
++				if (xid == kmsg_tracer->info.xid.ignore_xid[i])
++					goto out;
++			}
++		}
++	}
++
++	trace_seq_puts(&seq, " }");
++	trace_seq_do_printf(&seq);
++	printf("\n");
++out:
++	fflush(stdout);
++	trace_seq_destroy(&seq);
++}
++#endif
+diff --git a/ras-report.h b/ras-report.h
+index eeb25bb..0564992 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -13,6 +13,7 @@
+ #include "ras-mc-handler.h"
+ #include "ras-record.h"
+ #include "types.h"
++#include "ras-kmsg.h"
+ 
+ /* Maximal length of backtrace. */
+ #define MAX_BACKTRACE_SIZE (1024 * 1024)
+@@ -127,6 +128,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev);
+ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
+ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
+ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
++void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
+ #endif
+ 
+ #endif
+diff --git a/ras-time.c b/ras-time.c
+new file mode 100644
+index 0000000..320f1a1
+--- /dev/null
++++ b/ras-time.c
+@@ -0,0 +1,103 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#define _GNU_SOURCE
++#include <inttypes.h>
++#include <stdint.h>
++#include <string.h>
++#include <errno.h>
++#include <signal.h>
++#include <time.h>
++#ifdef HAVE_SYSINFO
++#include <sys/sysinfo.h>
++#endif
++
++#include "ras-time.h"
++
++struct timeval boot_time;
++time_t suspended_time;
++/* Store "now - uptime" in @boot_time (shadows the global); 0 or a negative errno. */
++int get_boot_time(struct timeval *boot_time)
++{
++#ifdef CLOCK_BOOTTIME
++	struct timespec hires_uptime;
++	struct timeval lores_uptime;
++#endif
++	struct timeval now;
++#ifdef HAVE_SYSINFO
++	struct sysinfo info;
++#endif
++
++	if (gettimeofday(&now, NULL) != 0)
++		return -errno;
++#ifdef CLOCK_BOOTTIME
++	if (clock_gettime(CLOCK_BOOTTIME, &hires_uptime) == 0) {
++		TIMESPEC_TO_TIMEVAL(&lores_uptime, &hires_uptime);
++		timersub(&now, &lores_uptime, boot_time);	/* boot = now - uptime */
++		return 0;
++	}
++#endif
++#ifdef HAVE_SYSINFO
++	/* fallback: sysinfo() uptime, so the microsecond part is set to 0 */
++	if (sysinfo(&info) != 0)
++		return -errno;
++
++	boot_time->tv_sec = now.tv_sec - info.uptime;
++	boot_time->tv_usec = 0;
++	return 0;
++#else
++	return -ENOSYS;
++#endif
++}
++/* Seconds CLOCK_BOOTTIME is ahead of CLOCK_MONOTONIC; 0 if either is unusable. */
++time_t get_suspended_time(void)
++{
++#if defined(CLOCK_BOOTTIME) && defined(CLOCK_MONOTONIC)
++	struct timespec boottime, monotonic;
++
++	if (!clock_gettime(CLOCK_BOOTTIME, &boottime) &&
++	    !clock_gettime(CLOCK_MONOTONIC, &monotonic))
++		return boottime.tv_sec - monotonic.tv_sec;
++#endif
++	return 0;
++}
++/* Step past the next @sep (or NUL) byte in [begin, end); returns the new position. */
++const char *skip_item(const char *begin, const char *end, const char *sep)
++{
++	const char *pos;
++
++	for (pos = begin; pos < end; pos++) {
++		/* the terminating byte itself is consumed as well */
++		if (*pos == '\0' || strchr(sep, *pos))
++			return pos + 1;
++	}
++	return pos;
++}
++
++/* Format the time of kmsg record @msg into the caller's 64-byte @timestamp buffer. */
++void get_kmsg_time(const char *msg, char *timestamp)
++{
++	const char *p = msg, *end;
++	char *nu = NULL;
++	uint64_t usec;
++	time_t t;
++	struct tm tm;
++
++	end = msg + strlen(msg);
++	if (end > msg)	/* an empty record must not yield a pointer before msg */
++		end--;
++
++	/* skip the first two comma-separated fields to reach the usec field */
++	p = skip_item(p, end, ",");
++	p = skip_item(p, end, ",;");
++
++	errno = 0;
++	usec = strtoumax(p, &nu, 10);
++	if (!errno && nu && (*nu == ';' || *nu == ','))
++		t = boot_time.tv_sec + suspended_time + (time_t)(usec / 1000000);
++	else
++		t = time(NULL);	/* unparsable record: fall back to "now" */
++
++	/* localtime_r(): no shared static buffer, and failure is detectable */
++	if (!localtime_r(&t, &tm) || !strftime(timestamp, 64, "%Y-%m-%d %H:%M:%S %z", &tm))
++		timestamp[0] = '\0';
++}
+diff --git a/ras-time.h b/ras-time.h
+new file mode 100644
+index 0000000..5dabae8
+--- /dev/null
++++ b/ras-time.h
+@@ -0,0 +1,27 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#ifndef RAS_TIME_H
++#define RAS_TIME_H
++
++# ifdef CLOCK_MONOTONIC_RAW
++#  define UL_CLOCK_MONOTONIC	CLOCK_MONOTONIC_RAW
++# else
++#  define UL_CLOCK_MONOTONIC	CLOCK_MONOTONIC
++# endif
++
++#include <sys/time.h>
++
++extern struct timeval boot_time;
++extern time_t suspended_time;
++
++int get_boot_time(struct timeval *boot_time);
++
++time_t get_suspended_time(void);
++
++int gettime_monotonic(struct timeval *tv);
++
++const char *skip_item(const char *begin, const char *end, const char *sep);
++
++void get_kmsg_time(const char *msg, char *timestamp);
++
++#endif /* RAS_TIME_H */
+diff --git a/rasdaemon.c b/rasdaemon.c
+index d5d2f85..30dcaf4 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -14,6 +14,8 @@
+ #include "ras-erst.h"
+ #include "ras-events.h"
+ #include "ras-logger.h"
++#include "ras-kmsg.h"
++#include "ras-time.h"
+ #include "ras-poison-page-stat.h"
+ #include "ras-record.h"
+ #include "ras-report.h"
+@@ -154,6 +156,13 @@ int main(int argc, char *argv[])
+ 		json_report = 0;
+ #endif
+ 
++#ifdef HAVE_KMSG_MONITOR
++	if (choices_disable &&
++	    strlen(choices_disable) != 0 &&
++	    strstr(choices_disable, "kmsg_monitor"))
++		kmsg_monitor = 0;
++#endif
++
+ #ifdef HAVE_MCE
+ 	const struct argp_option offline_options[] = {
+ 		{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+@@ -271,6 +280,11 @@ int main(int argc, char *argv[])
+ 		log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n");
+ 	}
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	get_boot_time(&boot_time);
++	suspended_time = get_suspended_time();
++#endif
++
+ 	handle_ras_events(args.record_events, args.enable_ipmitool);
+ 
+ #ifdef HAVE_NVGPU
+diff --git a/trigger.c b/trigger.c
+index 7387113..d410137 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -99,6 +99,8 @@ struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"};
+ struct event_trigger pre_page_offline_trigger = {"page_offline", "PRE_PAGE_OFFLINE_TRIGGER"};
+ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFFLINE_TRIGGER"};
+ 
++struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"};
++
+ static struct event_trigger *event_triggers[] = {
+ 	&mc_ue_trigger,
+ #ifdef HAVE_MCE
+@@ -117,6 +119,9 @@ static struct event_trigger *event_triggers[] = {
+ 	&pre_page_offline_trigger,
+ 	&post_page_offline_trigger,
+ #endif
++#ifdef HAVE_KMSG_MONITOR
++	&kmsg_trigger,
++#endif
+ };
+ 
+ void setup_event_trigger(const char *event)
+@@ -421,3 +426,53 @@ void run_page_offline_trigger(unsigned long long addr, int otype, int type)
+ 		__run_page_offline_trigger(addr, otype, &pre_page_offline_trigger);
+ }
+ 
++/* Run the kmsg trigger with PATH plus one "key=value" env entry per matched group. */
++void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg)
++{
++	char *env[MAX_ENV], *key;
++	int ei = 0, e, s;
++	int group_count = kmsg_tracer->group_count;
++	regmatch_t *matches = kmsg_tracer->matches;
++	struct event_trigger *trigger = &kmsg_trigger;
++	char tmpbuf[128];
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	ei++;	/* count a slot only once asprintf() filled it, so cleanup frees valid pointers */
++
++	for (int j = 1; j < group_count; j++) {
++		s = matches[j].rm_so;
++		e = matches[j].rm_eo;
++		key = kmsg_tracer->group_key[j - 1];
++		if (!key)	/* fewer keys configured than capture groups */
++			break;
++		if (s >= 0) {	/* keep one env[] slot free for the NULL terminator */
++			if (ei >= MAX_ENV - 1 || asprintf(&env[ei], "%s=%.*s", key, (int)(e - s), msg + s) < 0)
++				goto free;
++			ei++;
++		}
++		if (!strcmp("xid", key) ||
++		    !strcmp("sxid", key) ||
++		    !strcmp("axid", key)) {
++			int xid;
++
++			snprintf(tmpbuf, 128, "%.*s", (int)(e - s), msg + s);
++			xid = (int)strtol(tmpbuf, NULL, 10);
++			for (int i = 0; i < kmsg_tracer->info.xid.len; i++) {
++				if (xid == kmsg_tracer->info.xid.ignore_xid[i])
++					goto free;
++			}
++		}
++	}
++
++	env[ei] = NULL;
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (int i = 0; i < ei; i++)
++		free(env[i]);
++}
++
+diff --git a/trigger.h b/trigger.h
+index 74df3d3..b5a6c2c 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -4,6 +4,7 @@
+ #define __TRIGGER_H__
+ 
+ #include "ras-record.h"
++#include "ras-kmsg.h"
+ 
+ enum page_offline_trigger_type {
+ 	PRE,
+@@ -27,5 +28,7 @@ void run_mce_record_trigger(struct mce_event *e);
+ void run_mf_event_trigger(struct ras_mf_event *e);
+ void run_aer_event_trigger(struct ras_aer_event *e);
+ void run_page_offline_trigger(unsigned long long addr, int otype, int type);
++void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
++
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1919377ade51d0e1a4a70b8e0c728124e88e4e11
--- /dev/null
+++ b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch
@@ -0,0 +1,1164 @@
+From 29c769fa59e73a016aea891476caea98fbf3a27d Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 12 Dec 2024 09:37:06 +0800
+Subject: [PATCH 19/30] rasdaemon: erst: add erst-mce erst-dmesg
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am          |   4 +-
+ configure.ac         |   4 +
+ ras-erst-dmesg.c     | 875 +++++++++++++++++++++++++++++++++++++++++++
+ ras-erst.c           |  18 +-
+ ras-erst.h           |   7 +
+ ras-record.h         |   1 +
+ ras-report-json.c    |  29 +-
+ ras-report.h         |   1 +
+ ras-signal-handler.c |   3 +
+ rasdaemon.c          |   2 -
+ 10 files changed, 932 insertions(+), 12 deletions(-)
+ create mode 100644 ras-erst-dmesg.c
+
+diff --git a/Makefile.am b/Makefile.am
+index 68b354b..da6ef46 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -142,7 +142,7 @@ if WITH_POISON_PAGE_STAT
+    rasdaemon_SOURCES += ras-poison-page-stat.c
+ endif
+ if WITH_ERST
+-   rasdaemon_SOURCES += ras-erst.c
++   rasdaemon_SOURCES += ras-erst.c ras-erst-dmesg.c
+ endif
+ 
+ if WITH_NVGPU
+@@ -152,7 +152,7 @@ ras-nvgpu-nvml.h: contrib/nvml.py
+    rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c
+ endif
+ 
+-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl
++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS)
+ rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS)
+ 
+ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+diff --git a/configure.ac b/configure.ac
+index dfb7f02..68fcb75 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -287,12 +287,16 @@ AC_ARG_ENABLE([erst],
+     AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)]))
+ 
+ AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [
++  AC_CHECK_LIB(z, inflate,[echo "found zlib"] , AC_MSG_ERROR([*** Unable to find zlib library]), )
++  ZLIBS="-lz"
+   AC_DEFINE(HAVE_ERST,1,"have ERST")
+   AC_SUBST([WITH_ERST])
+ ])
+ AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"])
+ 
++AC_SUBST([ZLIBS])
++
+ AC_ARG_ENABLE([nvgpu],
+     AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events]))
+ 
+diff --git a/ras-erst-dmesg.c b/ras-erst-dmesg.c
+new file mode 100644
+index 0000000..ce61a6a
+--- /dev/null
++++ b/ras-erst-dmesg.c
+@@ -0,0 +1,875 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++
++/*
++* Copyright (C) 2025 Alibaba Inc
++*/
++
++#include <dirent.h>
++#include <stdlib.h>
++#include <sys/stat.h>
++#include <unistd.h>
++#include <utmp.h>
++#include <zlib.h>
++
++#include "bitfield.h"
++#include "ras-events.h"
++#include "ras-erst.h"
++#include "ras-logger.h"
++#include "ras-mce-handler.h"
++#include "ras-record.h"
++#include "ras-report.h"
++#include "types.h"
++/* Compiled regexes; NOTE(review): presumably one per APEI_* dmesg pattern -- confirm. */
++struct apei_regex {
++	regex_t hdr;
++	regex_t severity;
++	regex_t error;
++	regex_t fru;
++	regex_t type;
++
++	regex_t addr;
++	regex_t loc;
++	regex_t mem_type;
++	regex_t mem_status;
++
++	regex_t port_type;
++	regex_t port;
++	regex_t id;
++	regex_t status;
++	regex_t aer_sev;
++	regex_t tlp_hdr;
++
++	regex_t cpu_id;
++
++	regex_t midr;
++	regex_t mpidr;
++};
++/* Section kinds; NOTE(review): presumably the values of struct apei.type -- confirm. */
++enum {
++	APEI_NONE,
++	APEI_CPU,
++	APEI_MEM,
++	APEI_PCIE,
++	APEI_ARM,
++};
++/* One APEI error record; NOTE(review): fields presumably filled from dmesg text -- confirm. */
++struct apei {
++	int id;
++	int sev;
++	int err_id;
++	char *fru;
++	int type;
++	time_t time;
++	union {		/* NOTE(review): member in use presumably follows @type -- confirm */
++		struct {
++			uint64_t addr;
++			char *loc;
++			char *status;
++			char *type;
++		} mem;
++		struct {
++			int port_type;
++			char *port;
++			char *vendor_id;
++			char *device_id;
++			char *status;
++			char *mask;
++			char *sev;
++			char *tlp_hdr;
++		} pcie;
++		struct {
++			char *cpu_id;
++		} cpu;
++		struct {
++			char *midr;
++			char *mpidr;
++		} arm;
++	};
++};
++
++time_t last_reboot_time;
++
++/*
++ * Walk /var/log/wtmp backwards and store the time of the LAST_REBOOT_INDEX-th
++ * newest "reboot" record in last_reboot_time (0 if seeking back fails first).
++ */
++static void get_last_reboot_time(void)
++{
++	const off_t rec_size = sizeof(struct utmp);	/* signed, so -rec_size is a valid offset */
++	struct utmp record;
++	int fd, reboots_found = 0;
++	time_t reboot_times = 0;
++
++	fd = open("/var/log/wtmp", O_RDONLY);
++	if (fd == -1) {
++		log(ALL, LOG_ERR, "Error opening wtmp file");
++		return;
++	}
++
++	if (lseek(fd, -rec_size, SEEK_END) == -1) {
++		perror("Error seeking in wtmp file");
++		close(fd);
++		return;
++	}
++
++	while (reboots_found < LAST_REBOOT_INDEX) {
++		if (read(fd, &record, sizeof(struct utmp)) != sizeof(struct utmp)) {
++			perror("Error reading wtmp file");
++			close(fd);
++			return;
++		}
++
++		if (strncmp(record.ut_line, "~", 1) == 0 &&
++		    strncmp(record.ut_user, "reboot", 6) == 0) {
++			reboot_times = record.ut_tv.tv_sec;
++			reboots_found++;
++		}
++
++		if (lseek(fd, -2 * rec_size, SEEK_CUR) == -1) {
++			reboot_times = 0;
++			break;
++		}
++	}
++
++	close(fd);
++	last_reboot_time = reboot_times;
++}
++
++#define DMESG_ERST_PREFIX "dmesg-erst"
++#define DMESG_ERST_SUFFIX "enc.z"
++
++#define APEI_HEADER	".*\\[(.*).[0-9]+\\] \\{([0-9]+)\\}\\[Hardware Error\\]: Hardware error from APEI Generic Hardware Error Source:.*"
++#define APEI_SEVERITY	".*\\{([0-9]+)\\}\\[Hardware Error\\]: event severity: (.*)"
++#define APEI_ERROR	".*\\{([0-9]+)\\}\\[Hardware Error\\]:  Error ([0-9]+), type: (.*)"
++#define APEI_MEM_FRU	".*\\{([0-9]+)\\}\\[Hardware Error\\]:  fru_text: (.*)"
++#define APEI_TYPE	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   section_type: (.*)"
++
++// MEM
++#define APEI_MEM_ADDR	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   physical_address: (.*)"
++#define APEI_MEM_LOC	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   (node:.*)"
++#define APEI_MEM_TYPE	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   error_type: [0-9]+, (.*)"
++#define APEI_MEM_STATUS	".*\\{([0-9]+)\\}\\[Hardware Error\\]:.*error_status: (.*) \\(.*\\)"
++
++// PCIE
++#define APEI_PORT_TYPE	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   port_type: ([0-9]+), (.*)"
++#define APEI_PORT	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   device_id: (.*)"
++#define APEI_ID		".*\\{([0-9]+)\\}\\[Hardware Error\\]:   vendor_id: (.*), device_id: (.*)"
++#define APEI_STATUS	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   aer_uncor_status: (.*), aer_uncor_mask: (.*)"
++#define APEI_AER_SEVE	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   aer_uncor_severity: (.*)"
++#define APEI_TLP_HDR	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   TLP Header: (.*)"
++
++#define APEI_CPU_ID	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   processor_id: (.*)"
++
++#define APEI_ARM_MIDR	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   MIDR: (.*)"
++#define APEI_ARM_MPIDR	".*\\{([0-9]+)\\}\\[Hardware Error\\]:   Multiprocessor Affinity Register \\(MPIDR\\): (.*)"
++
++/* Inflate one complete raw-deflate buffer. *decompressed_data_size holds the output
++ * capacity on entry and the produced length on success. Returns Z_OK or a zlib error. */
++static int decompress_deflate(const char *compressed_data, ssize_t compressed_data_size,
++			char *decompressed_data, ssize_t *decompressed_data_size, z_stream *zstream)
++{
++	int ret = inflateReset2(zstream, -MAX_WBITS);
++
++	if (ret != Z_OK)
++		return ret;
++
++	zstream->next_in = (Bytef *)compressed_data;
++	zstream->avail_in = compressed_data_size;
++	zstream->next_out = (Bytef *)decompressed_data;
++	zstream->avail_out = *decompressed_data_size;
++
++	ret = inflate(zstream, Z_FINISH);
++	if (ret != Z_STREAM_END)
++		return Z_DATA_ERROR;
++	*decompressed_data_size = zstream->total_out;
++	/* Z_STREAM_END is 1; callers treat any non-zero return as failure */
++	return Z_OK;
++}
++
++/* Report one parsed APEI memory error as a JSON mc event (only when built with HAVE_JSON_REPORT). */
++static void apei_report_mem(struct trace_seq *s, struct apei *apei)
++{
++	struct ras_mc_event ev = {0};
++	char msg_buf[400];
++	time_t t;
++	struct tm *tm;
++
++	ev.erst = 1;
++	/* fall back to "now" when the record carried no usable timestamp */
++	t = apei->time ? apei->time : time(NULL);
++
++	tm = localtime(&t);
++	if (tm)
++		strftime(ev.timestamp, sizeof(ev.timestamp),
++			"%Y-%m-%d %H:%M:%S %z", tm);
++
++	ev.error_count = 1;
++	ev.grain = 1;
++	ev.top_layer = -1;
++	ev.middle_layer = -1;
++	ev.lower_layer = -1;
++
++	switch (apei->sev) {
++	case GHES_SEV_CORRECTED:
++		ev.error_type = "Corrected";
++		break;
++	case GHES_SEV_RECOVERABLE:
++		ev.error_type = "Uncorrected";
++		break;
++	case GHES_SEV_PANIC:
++		ev.error_type = "Fatal";
++		break;
++	default:
++		ev.error_type = "Info";
++	}
++	ev.severity = apei->sev;
++
++	/* loc is NULL when no location line was parsed; never hand NULL to %s */
++	snprintf(msg_buf, sizeof(msg_buf), "APEI location: %s status(0x00000000): %s",
++						apei->mem.loc ? apei->mem.loc : "",
++						apei->mem.status ? apei->mem.status : "");
++	ev.driver_detail = msg_buf;
++
++	ev.address = apei->mem.addr;
++	ev.mc_index = 0;
++
++#ifdef HAVE_JSON_REPORT
++	report_mc_event_json(s, &ev);
++#endif
++}
++
++/* bit field meaning for correctable error */
++static const char *aer_cor_errors[32] = {
++	/* Correctable errors */
++	[0]  = "Receiver Error",
++	[6]  = "Bad TLP",
++	[7]  = "Bad DLLP",
++	[8]  = "RELAY_NUM Rollover",
++	[12] = "Replay Timer Timeout",
++	[13] = "Advisory Non-Fatal",
++	[14] = "Corrected Internal Error",
++};
++
++/* bit field meaning for uncorrectable error */
++static const char *aer_uncor_errors[32] = {
++	/* Uncorrectable errors */
++	[4]  = "Data Link Protocol",
++	[12] = "Poisoned TLP",
++	[13] = "Flow Control Protocol",
++	[14] = "Completion Timeout",
++	[15] = "Completer Abort",
++	[16] = "Unexpected Completion",
++	[17] = "Receiver Overflow",
++	[18] = "Malformed TLP",
++	[19] = "ECRC",
++	[20] = "Unsupported Request",
++};
++
++/* Report one parsed APEI PCIe AER error as a JSON aer event (only when built with HAVE_JSON_REPORT). */
++static void apei_report_pcie(struct trace_seq *s, struct apei *apei)
++{
++	struct ras_aer_event ev = {0};
++	unsigned long long status_val;
++	char buf[1024];
++	time_t t;
++	struct tm *tm;
++
++	ev.erst = 1;
++	/* fall back to "now" when the record carried no usable timestamp */
++	t = apei->time ? apei->time : time(NULL);
++
++	tm = localtime(&t);
++	if (tm)
++		strftime(ev.timestamp, sizeof(ev.timestamp),
++			"%Y-%m-%d %H:%M:%S %z", tm);
++
++	ev.dev_name = apei->pcie.port;
++	/* the id line can be absent from a truncated record; strtoul(NULL, ...) would crash */
++	ev.vendor_id = apei->pcie.vendor_id ? strtoul(apei->pcie.vendor_id, NULL, 16) : 0;
++	ev.device_id = apei->pcie.device_id ? strtoul(apei->pcie.device_id, NULL, 16) : 0;
++
++	if (apei->pcie.status) {
++		status_val = strtoull(apei->pcie.status, NULL, 16);
++
++		if (apei->sev == GHES_SEV_CORRECTED)
++			bitfield_msg(buf, sizeof(buf), aer_cor_errors, 32, 0, 0, status_val);
++		else
++			bitfield_msg(buf, sizeof(buf), aer_uncor_errors, 32, 0, 0, status_val);
++	} else {
++		snprintf(buf, 1024, "no status");
++	}
++	ev.msg = buf;
++
++	ev.tlp_header_valid = (apei->pcie.tlp_hdr != NULL);
++	if (ev.tlp_header_valid)
++		snprintf((buf + strlen(ev.msg)), 1024 - strlen(ev.msg),
++			" TLP Header: %s", apei->pcie.tlp_hdr);
++
++	ev.severity = apei->sev;
++	switch (apei->sev) {
++	case GHES_SEV_RECOVERABLE:
++		ev.error_type = "Uncorrected (Non-Fatal)";
++		break;
++	case GHES_SEV_PANIC:
++		ev.error_type = "Uncorrected (Fatal)";
++		break;
++	case GHES_SEV_CORRECTED:
++		ev.error_type = "Corrected";
++		break;
++	default:
++		ev.error_type = "Unknown severity";
++	}
++
++#ifdef HAVE_JSON_REPORT
++	report_aer_event_json(s, &ev);
++#endif
++}
++
++/*
++ * Print the pending APEI record as one line on stdout, then free its strings and
++ * reset *apei for the next record. A record with no section type is left untouched.
++ */
++static void report_apei(struct apei *apei)
++{
++	struct trace_seq seq;
++	time_t t;
++	struct tm *tm;
++	char timestamp[64] = "";	/* stays empty if localtime() fails */
++	int is_arm = (apei->type == APEI_ARM);
++
++	if (!apei->type)
++		return;
++
++	trace_seq_init(&seq);
++
++	switch (apei->type) {
++	case APEI_MEM:
++		apei_report_mem(&seq, apei);
++		free(apei->mem.loc);
++		free(apei->mem.status);
++		free(apei->mem.type);
++		break;
++	case APEI_PCIE:
++		apei_report_pcie(&seq, apei);
++		free(apei->pcie.port);
++		free(apei->pcie.vendor_id);
++		free(apei->pcie.device_id);
++		free(apei->pcie.status);
++		free(apei->pcie.mask);
++		free(apei->pcie.sev);
++		free(apei->pcie.tlp_hdr);
++		break;
++	case APEI_CPU:
++	case APEI_ARM:
++		t = apei->time ? apei->time : time(NULL);
++		tm = localtime(&t);
++		if (tm)
++			strftime(timestamp, sizeof(timestamp),
++				"%Y-%m-%d %H:%M:%S %z", tm);
++
++		trace_seq_printf(&seq, "{ \"%s\": \"%s\", ", JSON_REPORT_KEY, is_arm ? "erst_arm_cpu" : "erst_cpu");
++		trace_seq_printf(&seq, "\"timestamp\": \"%s\", ", timestamp);
++		trace_seq_printf(&seq, "\"fru\": \"%s\", ", apei->fru ? apei->fru : "");
++		trace_seq_printf(&seq, "\"severity\": \"%s\", ", severity_strs[apei->sev]);
++		if (is_arm) {
++			/* the comma after midr keeps the object valid JSON */
++			trace_seq_printf(&seq, "\"midr\": \"%s\", ", apei->arm.midr ? apei->arm.midr : "");
++			trace_seq_printf(&seq, "\"mpidr\": \"%s\" ", apei->arm.mpidr ? apei->arm.mpidr : "");
++			free(apei->arm.midr);
++			free(apei->arm.mpidr);
++		} else {
++			trace_seq_printf(&seq, "\"cpu_id\": \"%s\" ", apei->cpu.cpu_id ? apei->cpu.cpu_id : "");
++			free(apei->cpu.cpu_id);
++		}
++		trace_seq_puts(&seq, "}");
++		break;
++	}
++
++	trace_seq_do_printf(&seq);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&seq);
++	free(apei->fru);
++	memset(apei, 0, sizeof(*apei));
++	apei->err_id = -1;
++}
++
++/* Return non-zero when name ends with DMESG_ERST_SUFFIX; names shorter than the suffix never match. */
++static int is_compressed_file(const char *name)
++{
++	size_t len = strlen(name), slen = strlen(DMESG_ERST_SUFFIX);
++
++	return len >= slen && !strcmp(name + len - slen, DMESG_ERST_SUFFIX);
++}
++
++/* Return 1 if line is a pstore "Panic#<n> Part1" header, 0 otherwise (a NULL line counts as no match). */
++static int line_is_panic_part1(char *line)
++{
++	int count, part;
++
++	if (!line || sscanf(line, "Panic#%d Part%d", &count, &part) != 2)
++		return 0;
++	return part == 1;
++}
++
++/*
++ * Return 1 when the record in file starts with a "Panic#<n> Part1" header, else 0.
++ * Compressed records are raw bytes, so the head is fetched with fread() and its
++ * length carried explicitly instead of relying on fgets()/strlen().
++ */
++static int file_is_panic_part1(FILE *file, const char *name, z_stream *zstream)
++{
++	char head[256], text[128];
++	size_t len = fread(head, 1, sizeof(head) - 1, file);
++
++	head[len] = '\0';
++	if (!len || !is_compressed_file(name))
++		return line_is_panic_part1(head);
++
++	if (inflateReset2(zstream, -MAX_WBITS) != Z_OK)
++		return 0;
++	zstream->next_in = (Bytef *)head;
++	zstream->avail_in = len;
++	zstream->next_out = (Bytef *)text;
++	zstream->avail_out = sizeof(text) - 1;
++	/* head is only a prefix of the stream, so Z_FINISH cannot succeed; judge the text decoded so far */
++	(void)inflate(zstream, Z_SYNC_FLUSH);
++	text[zstream->total_out] = '\0';
++
++	return line_is_panic_part1(text);
++}
++
++/* Copy capture group i of line into buf ("" when the group did not match).
++ * Every caller passes a 128-byte buffer, so longer groups are truncated to fit. */
++static void regex_group(regmatch_t *m, int i, const char *line, char *buf)
++{
++	int s = m[i].rm_so, len = m[i].rm_eo - s;
++
++	buf[0] = '\0';
++	/* snprintf's size argument includes the terminator, hence the + 1 */
++	if (s >= 0)
++		snprintf(buf, (len < 127 ? len : 127) + 1, "%s", line + s);
++}
++
++/* Feed one dmesg line into the APEI record being assembled in *apei; the pending record
++ * is flushed through report_apei() when a header or error line with a different id arrives.
++ * Returns 0 when the line was consumed or ignored, a regexec() error code otherwise. */
++static int dmesg_erst_line_process(const char *line, struct apei_regex *regex, struct apei *apei)
++{
++	int ret, err_id = 0, apei_id = 0;
++	regmatch_t matches[4];
++	char buf[128];
++	regex_t *re;
++	time_t t;
++
++	ret = regexec(re = &regex->hdr, line, 4, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, line, buf);
++		apei_id = atoi(buf);
++
++		if (apei->id && apei_id != apei->id)
++			report_apei(apei);
++		apei->id = apei_id;
++
++		regex_group(matches, 1, line, buf);
++		t = atoll(buf);
++
++		if (last_reboot_time)
++			apei->time = last_reboot_time + t;
++		else
++			apei->time = 0;
++
++		return 0;
++	}
++
++	ret = regexec(re = &regex->error, line, 4, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, line, buf);
++		err_id = atoi(buf);
++
++		if (apei->err_id != -1 && err_id != apei->err_id)
++			report_apei(apei);
++
++		apei->err_id = err_id;
++
++		regex_group(matches, 3, line, buf);
++		if (!strcmp("corrected", buf))
++			apei->sev = GHES_SEV_CORRECTED;
++		else if (!strcmp("recoverable", buf))
++			apei->sev = GHES_SEV_RECOVERABLE;
++		else if (!strcmp("fatal", buf))
++			apei->sev = GHES_SEV_PANIC;
++		else
++			apei->sev = GHES_SEV_NO;
++		return 0;
++	}
++
++	if (!apei->type) {
++		ret = regexec(re = &regex->type, line, 4, matches, 0);
++		if (ret)
++			goto error;
++
++		regex_group(matches, 2, line, buf);
++		if (!strcmp("general processor error", buf))
++			apei->type = APEI_CPU;
++		else if (!strcmp("memory error", buf))
++			apei->type = APEI_MEM;
++		else if (!strcmp("PCIe error", buf))
++			apei->type = APEI_PCIE;
++		else if (!strcmp("ARM processor error", buf))
++			apei->type = APEI_ARM;
++		else
++			apei->type = APEI_NONE;
++
++		return 0;
++	}
++
++	switch (apei->type) {
++	case APEI_CPU:
++		ret = regexec(re = &regex->cpu_id, line, 4, matches, 0);
++		if (ret)
++			goto error;
++		regex_group(matches, 2, line, buf);
++		apei->cpu.cpu_id = strdup(buf);
++
++		return 0;
++	case APEI_ARM:
++		if (!apei->arm.midr) {
++			ret = regexec(re = &regex->midr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->arm.midr = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		if (!apei->arm.mpidr) {
++			ret = regexec(re = &regex->mpidr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->arm.mpidr = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		return 0;
++	case APEI_MEM:
++		if (!apei->mem.addr) {
++			ret = regexec(re = &regex->addr, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.addr = strtoull(buf, NULL, 16);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.loc) {
++			ret = regexec(re = &regex->loc, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.loc = strdup(buf);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.type) {
++			ret = regexec(re = &regex->mem_type, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.type = strdup(buf);
++				return 0;
++			}
++		}
++
++		if (!apei->mem.status) {
++			ret = regexec(re = &regex->mem_status, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->mem.status = strdup(buf);
++				return 0;
++			}
++		}
++		return 0;	/* never fall into the PCIe parser: its fields alias the mem union */
++	case APEI_PCIE:
++		/* port type: numeric value only, the textual name is dropped */
++		ret = regexec(re = &regex->port_type, line, 4, matches, 0);
++		if (ret > REG_NOMATCH) {
++			goto error;
++		} else if (!ret) {
++			regex_group(matches, 2, line, buf);
++			apei->pcie.port_type = atoi(buf);
++
++			return 0;
++		}
++
++		// port
++		if (!apei->pcie.port) {
++			ret = regexec(re = &regex->port, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.port = strdup(buf);
++				return 0;
++			}
++		}
++
++		// vendor id device id
++		if (!apei->pcie.vendor_id) {
++			ret = regexec(re = &regex->id, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.vendor_id = strdup(buf);
++				regex_group(matches, 3, line, buf);
++				apei->pcie.device_id = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// status
++		if (!apei->pcie.status) {
++			ret = regexec(re = &regex->status, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.status = strdup(buf);
++				regex_group(matches, 3, line, buf);
++				apei->pcie.mask = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// aer sev
++		if (!apei->pcie.sev) {
++			ret = regexec(re = &regex->aer_sev, line, 4, matches, 0);
++			if (ret > REG_NOMATCH) {
++				goto error;
++			} else if (!ret) {
++				regex_group(matches, 2, line, buf);
++				apei->pcie.sev = strdup(buf);
++
++				return 0;
++			}
++		}
++
++		// tlp hdr
++		if (!apei->pcie.tlp_hdr) {
++			ret = regexec(re = &regex->tlp_hdr, line, 4, matches, 0);
++			if (ret)
++				goto error;
++			regex_group(matches, 2, line, buf);
++			apei->pcie.tlp_hdr = strdup(buf);
++			return 0;
++		}
++	}
++
++error:
++	if (ret == REG_NOMATCH)
++		return 0;
++	regerror(ret, re, buf, sizeof(buf));
++	printf("Regex execution error: %s\n", buf);
++	return ret;
++}
++
++/* Load one pstore dmesg record (inflating it when the name carries the compressed
++ * suffix), feed it line by line to the APEI parser and report the last record. */
++static int handle_erst_dmesg(FILE *file, const char *name, z_stream *zstream, struct apei_regex *regex)
++{
++	long fileSize;
++	char *file_buf, *text, *line, *out_data = NULL;
++	ssize_t out_max_size, out_data_size, bytesRead;
++	int ret = 0;
++	struct apei apei = {0};
++
++	apei.err_id = -1;
++
++	if (!file_is_panic_part1(file, name, zstream))
++		return -1;
++
++	if (fseek(file, 0, SEEK_END) != 0)
++		return -1;
++
++	fileSize = ftell(file);
++	if (fileSize == -1)
++		return -1;
++
++	file_buf = (char *)malloc(fileSize + 1);
++	if (!file_buf)
++		return -1;
++
++	rewind(file);
++	bytesRead = fread(file_buf, 1, fileSize, file);
++	if (bytesRead != fileSize) {
++		ret = -1;
++		goto free_file;
++	}
++	file_buf[fileSize] = '\0';
++	text = file_buf;
++
++	if (is_compressed_file(name)) {
++		out_max_size = fileSize * 3;
++		out_data = (char *)malloc(out_max_size + 1);	/* + 1 for the terminator */
++		if (!out_data) {
++			ret = -1;
++			goto free_file;
++		}
++
++		/* the size argument is the output capacity going in and the length coming out */
++		out_data_size = out_max_size;
++		ret = decompress_deflate(file_buf, fileSize, out_data, &out_data_size, zstream);
++		if (ret != Z_OK && ret != Z_STREAM_END)
++			goto free_out;
++		ret = 0;
++
++		out_data[out_data_size] = '\0';
++		text = out_data;	/* file_buf is kept so each buffer is freed exactly once */
++	}
++
++	for (line = strtok(text, "\n"); line; line = strtok(NULL, "\n"))
++		dmesg_erst_line_process(line, regex, &apei);
++
++	report_apei(&apei);
++
++free_out:
++	free(out_data);
++free_file:
++	free(file_buf);
++
++	return ret;
++}
++
++/* Compile str as an extended regex into re; print the regerror() text and return its code on failure. */
++static int init_reg(regex_t *re, const char *str)
++{
++	char errbuf[128];
++	int rc = regcomp(re, str, REG_EXTENDED);
++
++	if (!rc)
++		return 0;
++
++	regerror(rc, re, errbuf, sizeof(errbuf));
++	printf("Regex execution error: %s\n", errbuf);
++
++	return rc;
++}
++
++/* Parse one directory entry if its name starts with DMESG_ERST_PREFIX, then delete it when erst_delete is set. */
++static void handle_erst_dmesg_file(const char *dir_name, const char *d_name, z_stream *zstream, struct apei_regex *regex)
++{
++	char file_path[512];
++	FILE *file;
++
++	if (strncmp(d_name, DMESG_ERST_PREFIX, strlen(DMESG_ERST_PREFIX)) != 0)
++		return;
++
++	snprintf(file_path, sizeof(file_path), "%s/%s", dir_name, d_name);
++	file = fopen(file_path, "r");
++	if (file == NULL) {
++		log(ALL, LOG_INFO, "Failed to open file %s\n", file_path);
++		return;
++	}
++
++	/* the parse result is not consulted: the deletion below happens either way */
++	handle_erst_dmesg(file, file_path, zstream, regex);
++	fclose(file);
++
++	if (!erst_delete)
++		return;
++	if (unlink(file_path))
++		log(ALL, LOG_INFO, "Error deleting file %s\n", file_path);
++}
++
++/* Walk ERST_PATH (and an "erst" sub-directory) and report every dmesg-erst panic record; the regexes are not regfree()d. */
++void handle_erst_panic(void)
++{
++	z_stream zstream = { 0 };
++	int rc = 0;
++	struct dirent *entry;
++	struct apei_regex regex;
++
++	if (!last_reboot_time)
++		get_last_reboot_time();
++
++	if (init_reg(&regex.hdr, APEI_HEADER) ||
++		init_reg(&regex.severity, APEI_SEVERITY) ||
++		init_reg(&regex.error, APEI_ERROR) ||
++		init_reg(&regex.fru, APEI_MEM_FRU) ||
++		init_reg(&regex.type, APEI_TYPE) ||
++		init_reg(&regex.addr, APEI_MEM_ADDR) ||
++		init_reg(&regex.loc, APEI_MEM_LOC) ||
++		init_reg(&regex.mem_type, APEI_MEM_TYPE) ||
++		init_reg(&regex.mem_status, APEI_MEM_STATUS) ||
++		init_reg(&regex.port_type, APEI_PORT_TYPE) ||
++		init_reg(&regex.port, APEI_PORT) ||
++		init_reg(&regex.id, APEI_ID) ||
++		init_reg(&regex.status, APEI_STATUS) ||
++		init_reg(&regex.aer_sev, APEI_AER_SEVE) ||
++		init_reg(&regex.tlp_hdr, APEI_TLP_HDR) ||
++		init_reg(&regex.cpu_id, APEI_CPU_ID) ||
++		init_reg(&regex.midr, APEI_ARM_MIDR) ||
++		init_reg(&regex.mpidr, APEI_ARM_MPIDR))
++		return;
++
++	DIR *dir = opendir(ERST_PATH);
++
++	if (!dir) {
++		log(ALL, LOG_INFO, "%s Failed to open directory %s\n", ERST_PATH, strerror(errno));
++		return;
++	}
++
++	rc = inflateInit2(&zstream, -MAX_WBITS);	/* the result has to be captured for the check below */
++	if (rc != Z_OK) {
++		log(ALL, LOG_INFO, "Failed to open init inflate %d\n", rc);
++		closedir(dir);
++		return;
++	}
++
++	while ((entry = readdir(dir)) != NULL) {
++		struct stat path_stat;
++		char file_path[MAX_PATH];
++
++		snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name);
++		if (stat(file_path, &path_stat))	/* path_stat is undefined when stat() fails */
++			continue;
++
++		if (S_ISDIR(path_stat.st_mode) && !strncmp("erst", entry->d_name, sizeof("erst"))) {
++			DIR *subdir = opendir(file_path);
++			struct dirent *subentry;
++
++			if (!subdir) {
++				log(ALL, LOG_INFO, "Failed to open directory %s\n", strerror(errno));
++				break;
++			}
++			while ((subentry = readdir(subdir)) != NULL)
++				handle_erst_dmesg_file(file_path, subentry->d_name, &zstream, &regex);
++			closedir(subdir);
++		} else
++			handle_erst_dmesg_file(ERST_PATH, entry->d_name, &zstream, &regex);
++	}
++
++	closedir(dir);
++	inflateEnd(&zstream);
++}
+diff --git a/ras-erst.c b/ras-erst.c
+index c024d60..a0ece1b 100644
+--- a/ras-erst.c
++++ b/ras-erst.c
+@@ -14,6 +14,8 @@
+ #include "ras-logger.h"
+ #include "ras-mce-handler.h"
+ #include "ras-record.h"
++#include "ras-report.h"
++#include "ras-time.h"
+ #include "types.h"
+ 
+ struct mce {
+@@ -43,11 +45,7 @@ struct mce {
+ 	uint32_t microcode;	/* Microcode revision */
+ };
+ 
+-static int erst_delete;
+-
+-#define ERST_PATH "/sys/fs/pstore/erst"
+-#define MCE_ERST_PREFIX "mce-erst"
+-#define ERST_EVENT_NAME "mce_erst_record"
++int erst_delete;
+ 
+ #ifdef HAVE_MCE
+ static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e)
+@@ -80,6 +78,9 @@ static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e)
+ 			 "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME);
+ 
+ 	report_mce_event(ras, NULL, &s, e);
++#ifdef HAVE_JSON_REPORT
++	report_mce_event_json(&s, e);
++#endif
+ 	trace_seq_terminate(&s);
+ 	trace_seq_do_printf(&s);
+ 	printf("\n");
+@@ -188,8 +189,15 @@ static void handle_erst_mce(void)
+ /* ERST just support mce now */
+ void handle_erst(void)
+ {
++	get_boot_time(&boot_time);
++	suspended_time = get_suspended_time();
++
+ 	if (getenv(ERST_DELETE))
+ 		erst_delete = atoi(getenv(ERST_DELETE));
+ 
++#ifdef HAVE_MCE
+ 	handle_erst_mce();
++#endif
++
++	handle_erst_panic();
+ }
+diff --git a/ras-erst.h b/ras-erst.h
+index 83d7535..29a5587 100644
+--- a/ras-erst.h
++++ b/ras-erst.h
+@@ -8,10 +8,17 @@
+ #define __RAS_ERST_H
+ 
+ #define ERST_DELETE	"ERST_DELETE"
++#define ERST_PATH "/sys/fs/pstore/erst"
++#define MCE_ERST_PREFIX "mce-erst"
++#define ERST_EVENT_NAME "mce_erst_record"
++#define ERST_PANIC_NAME "dmesg_erst_record"
++#define LAST_REBOOT_INDEX 2
+ 
++extern int erst_delete;
+ #ifdef HAVE_MCE
+ void handle_erst_mce(void);
+ #endif
+ 
+ void handle_erst(void);
++void handle_erst_panic(void);
+ #endif
+diff --git a/ras-record.h b/ras-record.h
+index 7f49b74..416f679 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -101,6 +101,7 @@ struct ras_arm_event {
+ 	uint64_t error_info;
+ 	uint64_t virt_fault_addr;
+ 	uint64_t phy_fault_addr;
++	int erst;
+ };
+ 
+ struct devlink_event {
+diff --git a/ras-report-json.c b/ras-report-json.c
+index 2d35355..e28cfac 100644
+--- a/ras-report-json.c
++++ b/ras-report-json.c
+@@ -45,6 +45,7 @@ void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev)
+ 			 "\"syndrome\": \"%#llx\", "
+ 			 "\"driver_detail\": \"%s\" }",
+ 			 JSON_REPORT_KEY,
++			 ev->erst ? "erst_mc_event" : "mc_event",
+ 			 (*ev->timestamp) ? ev->timestamp : NONE,
+ 			 severity_strs[ev->severity],
+ 			 ev->error_count,
+@@ -114,7 +115,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev)
+ 	get_pci_dev_name(ev->dev_name, pci_name, 128, &vendor, &device);
+ 
+ 	trace_seq_printf(s,
+-			 "\n{ \"%s\": \"aer_event\", "	\
++			 "\n{ \"%s\": \"%s\", "	\
+ 			 "\"timestamp\": \"%s\", "	\
+ 			 "\"severity\": \"%s\", "	\
+ 			 "\"error_type\": \"%s\", "	\
+@@ -124,12 +125,14 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev)
+ 			 "\"device_id\": \"%#x\", "	\
+ 			 "\"msg\": \"%s\" }",
+ 			 JSON_REPORT_KEY,
++			 ev->erst ? "erst_aer_event" : "aer_event",
+ 			 (*ev->timestamp) ? ev->timestamp : NONE,
+ 			 severity_strs[ev->severity],
+ 			 (ev->error_type) ? ev->error_type : NONE,
+ 			 (ev->dev_name) ? ev->dev_name : NONE,
+ 			 (*pci_name) ? pci_name : NONE,
+-			 vendor, device,
++			 ev->vendor_id ? ev->vendor_id : vendor,
++			 ev->device_id ? ev->device_id : device,
+ 			 (ev->msg) ? ev->msg : NONE);
+ }
+ 
+@@ -139,7 +142,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev)
+ 		return;
+ 
+ 	trace_seq_printf(s,
+-			 "\n{ \"%s\": \"arm_event\", "	\
++			 "\n{ \"%s\": \"%s\", "	\
+ 			 "\"timestamp\": \"%s\", "	\
+ 			 "\"error_count\": %d, "	\
+ 			 "\"affinity\": %d, "		\
+@@ -148,6 +151,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev)
+ 			 "\"running_state\": %d, "	\
+ 			 "\"psci_state\": %d }",
+ 			 JSON_REPORT_KEY,
++			 ev->erst ? "erst_arm_event" : "arm_event",
+ 			 (*ev->timestamp) ? ev->timestamp : NONE,
+ 			 ev->error_count,
+ 			 ev->affinity,
+@@ -173,6 +177,24 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev)
+ 			 (ev->action_result) ? ev->action_result : NONE);
+ }
+ 
++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev)
++{
++	if (!s || !ev || !json_report)
++		return;
++
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"signal_event\", \"timestamp\": \"%s\", " \
++			 "\"signo\": %d, \"sigerr\": %d, " \
++			 "\"sigcode\": %d, \"comm\": \"%s\", " \
++			 "\"pid\": %d, \"group\": %d, " \
++			 "\"result\": %d }",
++			 JSON_REPORT_KEY,
++			 (*ev->timestamp) ? ev->timestamp : NONE,
++			 ev->sig, ev->error_no, ev->code,
++			 (ev->comm) ? ev->comm : NONE,
++			 ev->pid, ev->group, ev->result);
++}
++
+ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
+ {
+ 	if (!s || !ev || !json_report)
+@@ -211,6 +233,7 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
+ 			 "\"mcgcap\": \"%#lx\", "
+ 			 "\"apicid\": \"%#x\" }",
+ 			 JSON_REPORT_KEY,
++			 ev->erst ? "erst_mce_record" : "mce_record",
+ 			 (*ev->timestamp) ? ev->timestamp : NONE,
+ 			 severity_strs[ev->severity],
+ 			 ev->bank,
+diff --git a/ras-report.h b/ras-report.h
+index 0564992..7f7f304 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -129,6 +129,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
+ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
+ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
+ void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev);
+ #endif
+ 
+ #endif
+diff --git a/ras-signal-handler.c b/ras-signal-handler.c
+index d15c4f6..0d999a6 100644
+--- a/ras-signal-handler.c
++++ b/ras-signal-handler.c
+@@ -130,6 +130,9 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record,
+ 
+ 	report_ras_signal_event(s, &ev);
+ 
++#ifdef HAVE_JSON_REPORT
++	report_signal_event_json(s, &ev);
++#endif
+ 	/* Store data into the SQLite DB */
+ #ifdef HAVE_SQLITE3
+ 	ras_store_signal_event(ras, &ev);
+diff --git a/rasdaemon.c b/rasdaemon.c
+index 30dcaf4..335c047 100644
+--- a/rasdaemon.c
++++ b/rasdaemon.c
+@@ -247,13 +247,11 @@ int main(int argc, char *argv[])
+ 			exit(EXIT_FAILURE);
+ 
+ #ifdef HAVE_ERST
+-#ifdef HAVE_MCE
+ 	if (choices_disable && strlen(choices_disable) != 0 &&
+ 	    strstr(choices_disable, "ras:erst"))
+ 		log(ALL, LOG_INFO, "Disabled ras:erst from config\n");
+ 	else
+ 		handle_erst();
+-#endif
+ #endif
+ 	if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE)))
+ 		config_pcie_edpc();
+-- 
+2.43.5
+
diff --git a/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch
new file mode 100644
index 0000000000000000000000000000000000000000..26de833c4c534b61ca8a37449dd805720f22228d
--- /dev/null
+++ b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch
@@ -0,0 +1,484 @@
+From e58b2e2c034ecfd6de044d8daee6d66a18b1ea3c Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Tue, 17 Dec 2024 09:36:55 +0800
+Subject: [PATCH 20/30] anolis: rasdaemon: add amdgpu ras error monitor
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |   2 +-
+ misc/rasdaemon.env |   1 +
+ ras-events.c       |   1 +
+ ras-kmsg-amdgpu.c  | 219 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-kmsg.c         |   4 +
+ ras-kmsg.h         |  25 ++++++
+ ras-mce-handler.c  |   3 +
+ ras-record.h       |   3 +
+ ras-report-json.c  |  81 +++++++++++++++++
+ ras-report.h       |   3 +
+ 10 files changed, 341 insertions(+), 1 deletion(-)
+ create mode 100644 ras-kmsg-amdgpu.c
+
+diff --git a/Makefile.am b/Makefile.am
+index da6ef46..328fa49 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -135,7 +135,7 @@ if WITH_SIGNAL
+    rasdaemon_SOURCES += ras-signal-handler.c
+ endif
+ if WITH_KMSG_MONITOR
+-   rasdaemon_SOURCES += ras-kmsg.c ras-time.c
++   rasdaemon_SOURCES += ras-kmsg.c ras-time.c ras-kmsg-amdgpu.c
+ endif
+ 
+ if WITH_POISON_PAGE_STAT
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index f498e24..2816505 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -131,6 +131,7 @@ MC_CE_STAT_THRESHOLD=2000
+ POISON_STAT_THRESHOLD=102400
+ 
+ ERST_DELETE=1
++AMDGPU_MCA_ENABLED=0
+ 
+ # EDPC config
+ #
+diff --git a/ras-events.c b/ras-events.c
+index d40f29e..88c83df 100644
+--- a/ras-events.c
++++ b/ras-events.c
+@@ -624,6 +624,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
+ 			} else if (size > 0) {
+ 				kmsg_buf[size] = '\0';
+ 				kmsg_match(kmsg_buf);
++				amdgpu_tracer_match(kmsg_buf);
+ 				memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX);
+ 			} else {
+ 				count_nready++;
+diff --git a/ras-kmsg-amdgpu.c b/ras-kmsg-amdgpu.c
+new file mode 100644
+index 0000000..c46525a
+--- /dev/null
++++ b/ras-kmsg-amdgpu.c
+@@ -0,0 +1,219 @@
++#define _GNU_SOURCE
++#include "ras-time.h"
++#include <regex.h>
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <dirent.h>
++#include <sys/syslog.h>
++#include <sys/time.h>
++#include <traceevent/event-parse.h>
++#include "ras-logger.h"
++#include "ras-report.h"
++
++#include "ras-kmsg.h"
++#include "trigger.h"
++
++#define AMDGPU_ERROR_HEADER	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: Accelerator Check Architecture events logged\n"
++#define AMDGPU_ERROR_STATUS	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].STATUS=(0x[0-9A-Fa-f]+)\n"
++#define AMDGPU_ERROR_ADDR	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].ADDR=(0x[0-9A-Fa-f]+)\n"
++#define AMDGPU_ERROR_MISC0	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].MISC0=(0x[0-9A-Fa-f]+)\n"
++#define AMDGPU_ERROR_IPID	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].IPID=(0x[0-9A-Fa-f]+)\n"
++#define AMDGPU_ERROR_SYND	".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].SYND=(0x[0-9A-Fa-f]+)\n"
++
++#define AMDGPU_MCA_ENABLED	"AMDGPU_MCA_ENABLED"
++
++static struct amdgpu_tracer *amdgpu_tracer;
++static struct amdgpu_error *amdgpu_error;
++static int amdgpu_mca_enable;
++
++/* Repackage an amdgpu ACA record as an offline SMCA MCE event (no-op without HAVE_MCE). */
++static void report_amdgpu_mca(struct amdgpu_error *e)
++{
++#ifdef HAVE_MCE
++	struct ras_mc_offline_event event = {0};	/* fields not set below must not carry stack garbage */
++
++	event.smca = true;
++	event.family = 0x17;
++	event.model = 0x17;
++	event.bank = 1;
++	event.status = e->status;
++	event.synd = e->synd;
++	event.ipid = e->ipid;
++	event.addr = e->addr;
++	event.misc0 = e->misc0;
++	event.domain = e->seq;
++	event.bus = e->bus;
++	event.device = e->dev;
++	event.function = e->func;
++	ras_offline_mce_event(&event);
++#endif
++}
++
++static void report_amdgpu_error(struct amdgpu_error *e)
++{
++	if (!amdgpu_mca_enable || !e->ipid || !e->status)	/* MCA decode off, or IPID/STATUS is zero */
++		report_amdgpu_error_json(e);
++	else
++		report_amdgpu_mca(e);
++}
++
++/* Copy capture group i of line into buf ("" when the group did not match).
++ * The caller passes a 128-byte buffer, so longer groups are truncated to fit. */
++static void regex_group(regmatch_t *m, int i, const char *line, char *buf)
++{
++	int s = m[i].rm_so, len = m[i].rm_eo - s;
++
++	buf[0] = '\0';
++	/* snprintf's size argument includes the terminator, hence the + 1 */
++	if (s >= 0)
++		snprintf(buf, (len < 127 ? len : 127) + 1, "%s", line + s);
++}
++
++/* Match one kmsg line against the amdgpu ACA patterns and fold it into the record in amdgpu_error. */
++void amdgpu_tracer_match(char *msg)
++{
++	regmatch_t matches[10];
++	regex_t *re;
++	char buf[128];
++	int ret;
++
++	if (!amdgpu_tracer || !amdgpu_error)	/* state not allocated: nothing to match against */
++		return;
++
++	ret = regexec(re = &(amdgpu_tracer->header), msg, 2, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		if (amdgpu_error->tracing)
++			report_amdgpu_error(amdgpu_error);
++
++		memset(amdgpu_error, 0, sizeof(*amdgpu_error));
++		amdgpu_error->tracing = 1;
++		get_kmsg_time(msg, amdgpu_error->timestamp);
++		regex_group(matches, 1, msg, buf);
++		sscanf(buf, "%x:%x:%x.%x",
++		       &amdgpu_error->seq,
++		       &amdgpu_error->bus,
++		       &amdgpu_error->dev,
++		       &amdgpu_error->func);
++
++		return;
++	}
++
++	ret = regexec(re = &amdgpu_tracer->status, msg, 3, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, msg, buf);
++		amdgpu_error->status = strtoull(buf, NULL, 16);
++
++		return;
++	}
++
++	ret = regexec(re = &amdgpu_tracer->addr, msg, 3, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, msg, buf);
++		amdgpu_error->addr = strtoull(buf, NULL, 16);
++
++		return;
++	}
++
++	ret = regexec(re = &amdgpu_tracer->misc0, msg, 3, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, msg, buf);
++		amdgpu_error->misc0 = strtoull(buf, NULL, 16);
++
++		return;
++	}
++
++	ret = regexec(re = &amdgpu_tracer->ipid, msg, 3, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, msg, buf);
++		amdgpu_error->ipid = strtoull(buf, NULL, 16);
++
++		return;
++	}
++
++	ret = regexec(re = &amdgpu_tracer->synd, msg, 3, matches, 0);
++	if (ret > REG_NOMATCH) {
++		goto error;
++	} else if (!ret) {
++		regex_group(matches, 2, msg, buf);
++		amdgpu_error->synd = strtoull(buf, NULL, 16);
++
++		report_amdgpu_error(amdgpu_error);
++		amdgpu_error->tracing = 0;
++
++		return;
++	}
++
++error:
++	if (ret == REG_NOMATCH)
++		return;
++	regerror(ret, re, buf, sizeof(buf));
++	printf("Regex execution error: %s\n", buf);
++}
++/* Free the state allocated by amdgpu_tracer_init(); always returns 0. NOTE(review): the compiled regexes are not regfree()d. */
++int amdgpu_tracer_destroy(void)
++{
++	log(ALL, LOG_INFO, "amdgpu tracer destroy\n");
++
++	/* The guards were inverted ("if (!p) free(p)"), so only NULL was ever freed; free(NULL) is a no-op. */
++	free(amdgpu_error);
++	amdgpu_error = NULL;
++
++	free(amdgpu_tracer);
++	amdgpu_tracer = NULL;
++	return 0;
++}
++/* Compile str as a POSIX extended regex into re; returns 0, or the regcomp() error code after printing it. */
++static int init_reg(regex_t *re, const char *str)
++{
++	char buf[128];
++	int ret;
++
++	ret = regcomp(re, str, REG_EXTENDED);
++	if (ret) {
++		regerror(ret, re, buf, sizeof(buf));
++		/* This is a compile failure; the message used to say "execution". */
++		printf("Regex compilation error: %s\n", buf);
++	}
++
++	return ret;
++}
++/* Read AMDGPU_MCA_ENABLED, allocate the tracer state and compile its regexes; returns 0 on success, -1 on failure. */
++int amdgpu_tracer_init(void)
++{
++	char *s = getenv(AMDGPU_MCA_ENABLED);
++
++	amdgpu_mca_enable = (s && !strcmp(s, "1")) ? 1 : 0;
++
++	amdgpu_error = calloc(1, sizeof(struct amdgpu_error));
++	amdgpu_tracer = calloc(1, sizeof(struct amdgpu_tracer));
++	if (!amdgpu_error || !amdgpu_tracer) {
++		free(amdgpu_error);	/* release whichever allocation succeeded instead of leaking it */
++		free(amdgpu_tracer);
++		amdgpu_error = NULL;
++		amdgpu_tracer = NULL;
++		return -1;
++	}
++
++	if (init_reg(&amdgpu_tracer->header, AMDGPU_ERROR_HEADER) ||
++	    init_reg(&amdgpu_tracer->status, AMDGPU_ERROR_STATUS) ||
++	    init_reg(&amdgpu_tracer->addr, AMDGPU_ERROR_ADDR) ||
++	    init_reg(&amdgpu_tracer->misc0, AMDGPU_ERROR_MISC0) ||
++	    init_reg(&amdgpu_tracer->ipid, AMDGPU_ERROR_IPID) ||
++	    init_reg(&amdgpu_tracer->synd, AMDGPU_ERROR_SYND)) {
++		log(ALL, LOG_ERR, "amdgpu tracer init failed\n");
++		return -1;	/* report the failure instead of claiming success */
++	}
++
++	return 0;
++}
+\ No newline at end of file
+diff --git a/ras-kmsg.c b/ras-kmsg.c
+index 2dd47d6..deeb475 100644
+--- a/ras-kmsg.c
++++ b/ras-kmsg.c
+@@ -72,6 +72,8 @@ int kmsg_tracer_destroy(void)
+ 	}
+ 	free(kmsg_tracer);
+ 
++	amdgpu_tracer_destroy();
++
+ 	return 0;
+ }
+ 
+@@ -82,6 +84,8 @@ int kmsg_tracer_init(void)
+ 	char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp;
+ 	char *kmsg_tracer_group_key, *token;
+ 
++	amdgpu_tracer_init();
++
+ 	s = getenv(KMSG_TRACE_END);
+ 	if (!s)
+ 		kmsg_trace_end = 0;
+diff --git a/ras-kmsg.h b/ras-kmsg.h
+index f31125f..9e34da5 100644
+--- a/ras-kmsg.h
++++ b/ras-kmsg.h
+@@ -3,6 +3,7 @@
+ #define __RAS_KMSG_H
+ 
+ #include <regex.h>
++#include <stdint.h>
+ 
+ /**
+  * Kernel message tracer related definitions
+@@ -40,8 +41,32 @@ struct kmsg_tracer_info {
+ 	} info;
+ };
+ 
++struct amdgpu_tracer {	/* one compiled regex per kmsg line type the amdgpu tracer recognises */
++	regex_t header;	/* compiled from AMDGPU_ERROR_HEADER */
++	regex_t status;	/* compiled from AMDGPU_ERROR_STATUS */
++	regex_t addr;	/* compiled from AMDGPU_ERROR_ADDR */
++	regex_t misc0;	/* compiled from AMDGPU_ERROR_MISC0 */
++	regex_t ipid;	/* compiled from AMDGPU_ERROR_IPID */
++	regex_t synd;	/* compiled from AMDGPU_ERROR_SYND */
++};
++
++struct amdgpu_error {	/* one amdgpu hardware error record assembled from consecutive kmsg lines */
++	char timestamp[64];	/* filled by get_kmsg_time() from the header line */
++	int seq, bus, dev, func;	/* hex fields of the "%x:%x:%x.%x" address on the header line; seq is reported as the PCI domain */
++	int tracing;	/* 1 between a header line and the SYND line that completes the record */
++	uint64_t status;	/* the five values below are parsed with strtoull(..., 16) from their kmsg lines */
++	uint64_t addr;
++	uint64_t misc0;
++	uint64_t ipid;
++	uint64_t synd;
++};
++
+ int kmsg_tracer_init(void);
+ int kmsg_tracer_destroy(void);
+ int kmsg_match(char *msg);
+ 
++void amdgpu_tracer_match(char *msg);
++int amdgpu_tracer_destroy(void);
++int amdgpu_tracer_init(void);
++
+ #endif
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index b61976a..fc2e8d4 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -491,6 +491,9 @@ int ras_offline_mce_event(struct ras_mc_offline_event *event)
+ 
+ 	trace_seq_init(&s);
+ 	report_mce_offline(&s, mce, priv);
++#ifdef HAVE_JSON_REPORT
++	report_mce_offline_json(&s, mce, event);
++#endif
+ 	trace_seq_do_printf(&s);
+ 	fflush(stdout);
+ 	trace_seq_destroy(&s);
+diff --git a/ras-record.h b/ras-record.h
+index 416f679..d0230f7 100644
+--- a/ras-record.h
++++ b/ras-record.h
+@@ -46,6 +46,9 @@ struct ras_mc_offline_event {
+ 	uint64_t ipid;
+ 	uint64_t synd;
+ 	uint64_t status;
++	uint64_t addr;
++	uint64_t misc0;
++	int domain, bus, device, function;
+ };
+ 
+ struct ras_aer_event {
+diff --git a/ras-report-json.c b/ras-report-json.c
+index e28cfac..577e856 100644
+--- a/ras-report-json.c
++++ b/ras-report-json.c
+@@ -319,3 +319,84 @@ out:
+ 	trace_seq_destroy(&seq);
+ }
+ #endif
++/* Append an "amdgpu_ras_event" JSON record built from mce and e to s; does nothing if s, e, mce or json_report is unset. */
++void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce,
++			     struct ras_mc_offline_event *e)
++{
++	char tmpbuf[128] = {0}, pci_name[128] = {0};
++	u16 vendor, device;	/* NOTE(review): printed below without initialisation - confirm get_pci_dev_name() always writes them */
++
++	if (!s || !e || !mce || !json_report)
++		return;
++
++	snprintf(tmpbuf, 128, "%x:%x:%x.%x", e->domain, e->bus, e->device, e->function);
++	get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device);
++
++	trace_seq_printf(s,
++			 "\n{ \"%s\": \"amdgpu_ras_event\", "	\
++			 "\"timestamp\": \"%s\", "		\
++			 "\"bank_name\": \"%s\", "			\
++			 "\"bank\": %d, "			\
++			 "\"mcastatus_msg\": \"%s\", "			\
++			 "\"mcistatus_msg\": \"%s\", "			\
++			 "\"mc_location\": \"%s\", "			\
++			 "\"error_msg\": \"%s\", "			\
++			 "\"pci_bdf\": \"%s\", "			\
++			 "\"pci_dev_name\": \"%s\", "			\
++			 "\"vendor_id\": \"%#x\", "			\
++			 "\"device_id\": \"%#x\", "			\
++			 "\"status\": \"%#lx\", "				\
++			 "\"addr\": \"%#lx\", "				\
++			 "\"misc0\": \"%#lx\", "				\
++			 "\"ipid\": \"%#lx\", "				\
++			 "\"synd\": \"%#lx\" }\n",
++			 JSON_REPORT_KEY,
++			 (*mce->timestamp) ? mce->timestamp : NONE,
++			 (*mce->bank_name) ? mce->bank_name : NONE,
++			 mce->bank,
++			 (*mce->mcastatus_msg) ? mce->mcastatus_msg : NONE,
++			 (*mce->mcistatus_msg) ? mce->mcistatus_msg : NONE,
++			 (*mce->mc_location) ? mce->mc_location : NONE,
++			 (*mce->error_msg) ? mce->error_msg : NONE,
++			 tmpbuf, pci_name, vendor, device,
++			 e->status, e->addr, e->misc0, e->ipid, e->synd);
++}
++/* Print a standalone "amdgpu_ras_event" JSON record for e to stdout; does nothing when e is NULL or json_report is unset. */
++void report_amdgpu_error_json(struct amdgpu_error *e)
++{
++	struct trace_seq seq;
++	char tmpbuf[128] = {0}, pci_name[128] = {0};
++	u16 vendor, device;
++
++	if (!e || !json_report)
++		return;
++
++	snprintf(tmpbuf, 128, "%x:%x:%x.%x", e->seq, e->bus, e->dev, e->func);
++	get_pci_dev_name(tmpbuf, pci_name, 128, &vendor, &device);
++
++	trace_seq_init(&seq);	/* register values use "0x%lx": the old "0x%#lx" printed the prefix twice (0x0x...) */
++	trace_seq_printf(&seq,
++			 "\n{ \"%s\": \"amdgpu_ras_event\", "	\
++			 "\"timestamp\": \"%s\", "	\
++			 "\"pci_dev_name\": \"%s\", "			\
++			 "\"vendor_id\": \"%#x\", "			\
++			 "\"device_id\": \"%#x\", "			\
++			 "\"status\": \"0x%lx\", "	\
++			 "\"addr\": \"0x%lx\", "		\
++			 "\"misc0\": \"0x%lx\", "	\
++			 "\"ipid\": \"0x%lx\", "		\
++			 "\"synd\": \"0x%lx\" }",
++			 JSON_REPORT_KEY,
++			 (*e->timestamp) ? e->timestamp : "",
++			 pci_name, vendor, device,
++			 e->status,
++			 e->addr,
++			 e->misc0,
++			 e->ipid,
++			 e->synd);
++
++	trace_seq_do_printf(&seq);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&seq);
++}
+diff --git a/ras-report.h b/ras-report.h
+index 7f7f304..7066a74 100644
+--- a/ras-report.h
++++ b/ras-report.h
+@@ -130,6 +130,9 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
+ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
+ void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
+ void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev);
++void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce,
++			     struct ras_mc_offline_event *e);
++void report_amdgpu_error_json(struct amdgpu_error *e);
+ #endif
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1021-anolis-config-disable-page-offline-defalut.patch b/1021-anolis-config-disable-page-offline-defalut.patch
new file mode 100644
index 0000000000000000000000000000000000000000..880db2207a175d13ecab251b40fa4458d5e085e4
--- /dev/null
+++ b/1021-anolis-config-disable-page-offline-defalut.patch
@@ -0,0 +1,26 @@
+From 344b4080d5d093123de8973b74f8289201931483 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Mon, 10 Mar 2025 11:27:45 +0800
+Subject: [PATCH 21/30] anolis: config: disable page offline by default
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ misc/rasdaemon.env | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 2816505..1833f1b 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -54,7 +54,7 @@ ROW_CE_ACTION="off"
+ #          Requires an uptodate kernel. Might not be successfull.
+ # soft-then-hard   First try to soft offline, then try hard offlining.
+ # Note: default offline choice is "soft".
+-PAGE_CE_ACTION="soft"
++PAGE_CE_ACTION="off"
+ 
+ # CPU Online Fault Isolation
+ # Whether to enable cpu online fault isolation (yes|no).
+-- 
+2.43.5
+
diff --git a/1022-anolis-disable-block-and-dev-error-default.patch b/1022-anolis-disable-block-and-dev-error-default.patch
new file mode 100644
index 0000000000000000000000000000000000000000..83ba86cd54d1979765171b83de7e5a2664843225
--- /dev/null
+++ b/1022-anolis-disable-block-and-dev-error-default.patch
@@ -0,0 +1,26 @@
+From b5d1f625e8cee3697965e975483e523543d38b4b Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Wed, 12 Mar 2025 09:59:55 +0800
+Subject: [PATCH 22/30] anolis: disable block and dev error default
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ misc/rasdaemon.env | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 1833f1b..198b050 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h"
+ # Prevent excessive isolation from causing an avalanche effect
+ CPU_ISOLATION_LIMIT="10"
+ 
+-DISABLE="json_report,kmsg_monitor"
++DISABLE="json_report,kmsg_monitor,block:block_rq_complete,devlink:devlink_health_report"
+ 
+ # Event Trigger
+ 
+-- 
+2.43.5
+
diff --git a/1023-anolis-add-nvml-in-tree.patch b/1023-anolis-add-nvml-in-tree.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9d0718c05f30299dadfa9d37d93ee95acb325f77
--- /dev/null
+++ b/1023-anolis-add-nvml-in-tree.patch
@@ -0,0 +1,11441 @@
+From 46af414d74baab0e03d716e3af7e77ea3186c47e Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 17 Apr 2025 17:17:55 +0800
+Subject: [PATCH 23/30] anolis: add nvml in tree
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am     |     1 +
+ contrib/nvml.h  | 11370 ++++++++++++++++++++++++++++++++++++++++++++++
+ contrib/nvml.py |    13 +-
+ 3 files changed, 11381 insertions(+), 3 deletions(-)
+ create mode 100644 contrib/nvml.h
+
+diff --git a/Makefile.am b/Makefile.am
+index 328fa49..4aba962 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -25,6 +25,7 @@ EXTRA_DIST = \
+ 	misc/rasdaemon.env \
+ 	misc/notices \
+ 	contrib/nvml.py \
++	contrib/nvml.h \
+ 	contrib/*_trigger
+ 
+ CLEANFILES= \
+diff --git a/contrib/nvml.h b/contrib/nvml.h
+new file mode 100644
+index 0000000..937332e
+--- /dev/null
++++ b/contrib/nvml.h
+@@ -0,0 +1,11370 @@
++/*
++ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
++ *
++ * NOTICE TO USER:
++ *
++ * This source code is subject to NVIDIA ownership rights under U.S. and
++ * international Copyright laws.  Users and possessors of this source code
++ * are hereby granted a nonexclusive, royalty-free license to use this code
++ * in individual and commercial software.
++ *
++ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
++ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
++ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
++ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
++ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
++ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
++ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
++ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
++ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
++ * OR PERFORMANCE OF THIS SOURCE CODE.
++ *
++ * U.S. Government End Users.   This source code is a "commercial item" as
++ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
++ * "commercial computer  software"  and "commercial computer software
++ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
++ * and is provided to the U.S. Government only as a commercial end item.
++ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
++ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
++ * source code with only those rights set forth herein.
++ *
++ * Any use of this source code in individual and commercial software must
++ * include, in the user documentation and internal comments to the code,
++ * the above Disclaimer and U.S. Government End Users Notice.
++ */
++
++/*
++NVML API Reference
++
++The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and
++managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
++3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
++tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
++
++API Documentation
++
++Supported platforms:
++- Windows:     Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit
++- Linux:       32-bit and 64-bit
++- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
++
++Supported products:
++- Full Support
++    - All Tesla products, starting with the Fermi architecture
++    - All Quadro products, starting with the Fermi architecture
++    - All vGPU Software products, starting with the Kepler architecture
++    - Selected GeForce Titan products
++- Limited Support
++    - All Geforce products, starting with the Fermi architecture
++
++The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
++not be added to the system path by default. To dynamically link to NVML, add this path to the PATH
++environmental variable. To dynamically load NVML, call LoadLibrary with this path.
++
++On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
++and 64 bit NVML libraries will be installed.
++
++Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
++*/
++
++#ifndef __nvml_nvml_h__
++#define __nvml_nvml_h__
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/*
++ * On Windows, set up methods for DLL export
++ * define NVML_STATIC_IMPORT when using nvml_loader library
++ */
++#if defined _WINDOWS
++    #if !defined NVML_STATIC_IMPORT
++        #if defined NVML_LIB_EXPORT
++            #define DECLDIR __declspec(dllexport)
++        #else
++            #define DECLDIR __declspec(dllimport)
++        #endif
++    #else
++        #define DECLDIR
++    #endif
++#else
++    #define DECLDIR
++#endif
++
++    #define NVML_MCDM_SUPPORT
++
++/**
++ * NVML API versioning support
++ */
++#define NVML_API_VERSION            12
++#define NVML_API_VERSION_STR        "12"
++/**
++ * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs.
++ * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this
++ * guard if you need to support older versions of the API
++ */
++#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS
++    #define nvmlInit                                    nvmlInit_v2
++    #define nvmlDeviceGetPciInfo                        nvmlDeviceGetPciInfo_v3
++    #define nvmlDeviceGetCount                          nvmlDeviceGetCount_v2
++    #define nvmlDeviceGetHandleByIndex                  nvmlDeviceGetHandleByIndex_v2
++    #define nvmlDeviceGetHandleByPciBusId               nvmlDeviceGetHandleByPciBusId_v2
++    #define nvmlDeviceGetNvLinkRemotePciInfo            nvmlDeviceGetNvLinkRemotePciInfo_v2
++    #define nvmlDeviceRemoveGpu                         nvmlDeviceRemoveGpu_v2
++    #define nvmlDeviceGetGridLicensableFeatures         nvmlDeviceGetGridLicensableFeatures_v4
++    #define nvmlEventSetWait                            nvmlEventSetWait_v2
++    #define nvmlDeviceGetAttributes                     nvmlDeviceGetAttributes_v2
++    #define nvmlComputeInstanceGetInfo                  nvmlComputeInstanceGetInfo_v2
++    #define nvmlDeviceGetComputeRunningProcesses        nvmlDeviceGetComputeRunningProcesses_v3
++    #define nvmlDeviceGetGraphicsRunningProcesses       nvmlDeviceGetGraphicsRunningProcesses_v3
++    #define nvmlDeviceGetMPSComputeRunningProcesses     nvmlDeviceGetMPSComputeRunningProcesses_v3
++    #define nvmlBlacklistDeviceInfo_t                   nvmlExcludedDeviceInfo_t
++    #define nvmlGetBlacklistDeviceCount                 nvmlGetExcludedDeviceCount
++    #define nvmlGetBlacklistDeviceInfoByIndex           nvmlGetExcludedDeviceInfoByIndex
++    #define nvmlDeviceGetGpuInstancePossiblePlacements  nvmlDeviceGetGpuInstancePossiblePlacements_v2
++    #define nvmlVgpuInstanceGetLicenseInfo              nvmlVgpuInstanceGetLicenseInfo_v2
++    #define nvmlDeviceGetDriverModel                    nvmlDeviceGetDriverModel_v2
++#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS
++
++#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \
++                                                      (ver << 24U))
++
++/***************************************************************************************************/
++/** @defgroup nvmlDeviceStructs Device Structs
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Special constant that some fields take when they are not available.
++ * Used when only part of the struct is not available.
++ *
++ * Each structure explicitly states when to check for this value.
++ */
++#define NVML_VALUE_NOT_AVAILABLE (-1)
++
++typedef struct nvmlDevice_st* nvmlDevice_t;
++
++/**
++ * Buffer size guaranteed to be large enough for pci bus id
++ */
++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE      32
++
++/**
++ * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy
++ */
++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE   16
++
++/**
++ * PCI information about a GPU device.
++ */
++typedef struct
++{
++    unsigned int version;            //!< The version number of this struct
++    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff
++    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
++    unsigned int device;             //!< The device's id on the bus, 0 to 31
++
++    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
++    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
++
++    unsigned int baseClass;          //!< The 8-bit PCI base class code
++    unsigned int subClass;           //!< The 8-bit PCI sub class code
++
++    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
++} nvmlPciInfoExt_v1_t;
++typedef nvmlPciInfoExt_v1_t  nvmlPciInfoExt_t;
++#define nvmlPciInfoExt_v1 NVML_STRUCT_VERSION(PciInfoExt, 1)
++
++/**
++ * PCI information about a GPU device.
++ */
++typedef struct nvmlPciInfo_st
++{
++    char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
++    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff
++    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
++    unsigned int device;             //!< The device's id on the bus, 0 to 31
++    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
++
++    // Added in NVML 2.285 API
++    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
++
++    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
++} nvmlPciInfo_t;
++
++/**
++ * PCI format string for ::busIdLegacy
++ */
++#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT           "%04X:%02X:%02X.0"
++
++/**
++ * PCI format string for ::busId
++ */
++#define NVML_DEVICE_PCI_BUS_ID_FMT                  "%08X:%02X:%02X.0"
++
++/**
++ * Utility macro for filling the pci bus id format from a nvmlPciInfo_t
++ */
++#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo)    (pciInfo)->domain, \
++                                                    (pciInfo)->bus,    \
++                                                    (pciInfo)->device
++
++/**
++ * Detailed ECC error counts for a device.
++ *
++ * @deprecated  Different GPU families can have different memory error counters
++ *              See \ref nvmlDeviceGetMemoryErrorCounter
++ */
++typedef struct nvmlEccErrorCounts_st
++{
++    unsigned long long l1Cache;      //!< L1 cache errors
++    unsigned long long l2Cache;      //!< L2 cache errors
++    unsigned long long deviceMemory; //!< Device memory errors
++    unsigned long long registerFile; //!< Register file errors
++} nvmlEccErrorCounts_t;
++
++/**
++ * Utilization information for a device.
++ * Each sample period may be between 1 second and 1/6 second, depending on the product being queried.
++ */
++typedef struct nvmlUtilization_st
++{
++    unsigned int gpu;                //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
++    unsigned int memory;             //!< Percent of time over the past sample period during which global (device) memory was being read or written
++} nvmlUtilization_t;
++
++/**
++ * Memory allocation information for a device (v1).
++ * The total amount is equal to the sum of the amounts of free and used memory.
++ */
++typedef struct nvmlMemory_st
++{
++    unsigned long long total;        //!< Total physical device memory (in bytes)
++    unsigned long long free;         //!< Unallocated device memory (in bytes)
++    unsigned long long used;         //!< Sum of Reserved and Allocated device memory (in bytes).
++                                     //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
++} nvmlMemory_t;
++
++/**
++ * Memory allocation information for a device (v2).
++ *
++ * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output.
++ */
++typedef struct nvmlMemory_v2_st
++{
++    unsigned int version;            //!< Structure format version (must be 2)
++    unsigned long long total;        //!< Total physical device memory (in bytes)
++    unsigned long long reserved;     //!< Device memory (in bytes) reserved for system use (driver or firmware)
++    unsigned long long free;         //!< Unallocated device memory (in bytes)
++    unsigned long long used;         //!< Allocated device memory (in bytes).
++} nvmlMemory_v2_t;
++
++#define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2)
++
++/**
++ * BAR1 Memory allocation Information for a device
++ */
++typedef struct nvmlBAR1Memory_st
++{
++    unsigned long long bar1Total;    //!< Total BAR1 Memory (in bytes)
++    unsigned long long bar1Free;     //!< Unallocated BAR1 Memory (in bytes)
++    unsigned long long bar1Used;     //!< Allocated Used Memory (in bytes)
++}nvmlBAR1Memory_t;
++
++/**
++ * Information about running compute processes on the GPU, legacy version
++ * for older versions of the API.
++ */
++typedef struct nvmlProcessInfo_v1_st
++{
++    unsigned int        pid;                //!< Process ID
++    unsigned long long  usedGpuMemory;      //!< Amount of used GPU memory in bytes.
++                                            //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
++                                            //! because Windows KMD manages all the memory and not the NVIDIA driver
++} nvmlProcessInfo_v1_t;
++
++/**
++ * Information about running compute processes on the GPU
++ */
++typedef struct nvmlProcessInfo_v2_st
++{
++    unsigned int        pid;                //!< Process ID
++    unsigned long long  usedGpuMemory;      //!< Amount of used GPU memory in bytes.
++                                            //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
++                                            //! because Windows KMD manages all the memory and not the NVIDIA driver
++    unsigned int        gpuInstanceId;      //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to
++                                            //  0xFFFFFFFF otherwise.
++    unsigned int        computeInstanceId;  //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to
++                                            //  0xFFFFFFFF otherwise.
++} nvmlProcessInfo_v2_t, nvmlProcessInfo_t;
++
++/**
++ * Information about running process on the GPU with protected memory
++ */
++typedef struct
++{
++    unsigned int        pid;                      //!< Process ID
++    unsigned long long  usedGpuMemory;            //!< Amount of used GPU memory in bytes.
++                                                  //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
++                                                  //! because Windows KMD manages all the memory and not the NVIDIA driver
++    unsigned int        gpuInstanceId;            //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is
++                                                  //  set to 0xFFFFFFFF otherwise.
++    unsigned int        computeInstanceId;        //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId
++                                                  //  is set to 0xFFFFFFFF otherwise.
++    unsigned long long  usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes.
++} nvmlProcessDetail_v1_t;
++
++/**
++ * Information about all running processes on the GPU for the given mode
++ */
++typedef struct
++{
++    unsigned int           version;             //!< Struct version, MUST be nvmlProcessDetailList_v1
++    unsigned int           mode;                //!< Process mode(Compute/Graphics/MPSCompute)
++    unsigned int           numProcArrayEntries; //!< Number of process entries in procArray
++    nvmlProcessDetail_v1_t *procArray;          //!< Process array
++} nvmlProcessDetailList_v1_t;
++
++typedef nvmlProcessDetailList_v1_t nvmlProcessDetailList_t;
++
++/**
++ * nvmlProcessDetailList version
++ */
++#define nvmlProcessDetailList_v1 NVML_STRUCT_VERSION(ProcessDetailList, 1)
++
++typedef struct nvmlDeviceAttributes_st
++{
++    unsigned int multiprocessorCount;       //!< Streaming Multiprocessor count
++    unsigned int sharedCopyEngineCount;     //!< Shared Copy Engine count
++    unsigned int sharedDecoderCount;        //!< Shared Decoder Engine count
++    unsigned int sharedEncoderCount;        //!< Shared Encoder Engine count
++    unsigned int sharedJpegCount;           //!< Shared JPEG Engine count
++    unsigned int sharedOfaCount;            //!< Shared OFA Engine count
++    unsigned int gpuInstanceSliceCount;     //!< GPU instance slice count
++    unsigned int computeInstanceSliceCount; //!< Compute instance slice count
++    unsigned long long memorySizeMB;        //!< Device memory size (in MiB)
++} nvmlDeviceAttributes_t;
++
++/**
++ * C2C Mode information for a device
++ */
++typedef struct
++{
++    unsigned int isC2cEnabled;
++} nvmlC2cModeInfo_v1_t;
++
++#define nvmlC2cModeInfo_v1 NVML_STRUCT_VERSION(C2cModeInfo, 1)
++
++/**
++ * Possible values that classify the remap availability for each bank. The max
++ * field will contain the number of banks that have maximum remap availability
++ * (all reserved rows are available). None means that there are no reserved
++ * rows available.
++ */
++typedef struct nvmlRowRemapperHistogramValues_st
++{
++    unsigned int max;
++    unsigned int high;
++    unsigned int partial;
++    unsigned int low;
++    unsigned int none;
++} nvmlRowRemapperHistogramValues_t;
++
++/**
++ * Enum to represent type of bridge chip
++ */
++typedef enum nvmlBridgeChipType_enum
++{
++    NVML_BRIDGE_CHIP_PLX = 0,
++    NVML_BRIDGE_CHIP_BRO4 = 1
++}nvmlBridgeChipType_t;
++
++/**
++ * Maximum number of NvLink links supported
++ */
++#define NVML_NVLINK_MAX_LINKS 18
++
++/**
++ * Enum to represent the NvLink utilization counter packet units
++ */
++typedef enum nvmlNvLinkUtilizationCountUnits_enum
++{
++    NVML_NVLINK_COUNTER_UNIT_CYCLES =  0,     // count by cycles
++    NVML_NVLINK_COUNTER_UNIT_PACKETS = 1,     // count by packets
++    NVML_NVLINK_COUNTER_UNIT_BYTES   = 2,     // count by bytes
++    NVML_NVLINK_COUNTER_UNIT_RESERVED = 3,    // count reserved for internal use
++    // this must be last
++    NVML_NVLINK_COUNTER_UNIT_COUNT
++} nvmlNvLinkUtilizationCountUnits_t;
++
++/**
++ * Enum to represent the NvLink utilization counter packet types to count
++ *  ** this is ONLY applicable with the units as packets or bytes
++ *  ** as specified in \a nvmlNvLinkUtilizationCountUnits_t
++ *  ** all packet filter descriptions are target GPU centric
++ *  ** these can be "OR'd" together
++ */
++typedef enum nvmlNvLinkUtilizationCountPktTypes_enum
++{
++    NVML_NVLINK_COUNTER_PKTFILTER_NOP        = 0x1,     // no operation packets
++    NVML_NVLINK_COUNTER_PKTFILTER_READ       = 0x2,     // read packets
++    NVML_NVLINK_COUNTER_PKTFILTER_WRITE      = 0x4,     // write packets
++    NVML_NVLINK_COUNTER_PKTFILTER_RATOM      = 0x8,     // reduction atomic requests
++    NVML_NVLINK_COUNTER_PKTFILTER_NRATOM     = 0x10,    // non-reduction atomic requests
++    NVML_NVLINK_COUNTER_PKTFILTER_FLUSH      = 0x20,    // flush requests
++    NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA   = 0x40,    // responses with data
++    NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80,    // responses without data
++    NVML_NVLINK_COUNTER_PKTFILTER_ALL        = 0xFF     // all packets
++} nvmlNvLinkUtilizationCountPktTypes_t;
++
++/**
++ * Struct to define the NVLINK counter controls
++ */
++typedef struct nvmlNvLinkUtilizationControl_st
++{
++    nvmlNvLinkUtilizationCountUnits_t units;
++    nvmlNvLinkUtilizationCountPktTypes_t pktfilter;
++} nvmlNvLinkUtilizationControl_t;
++
++/**
++ * Enum to represent NvLink queryable capabilities
++ */
++typedef enum nvmlNvLinkCapability_enum
++{
++    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,     // P2P over NVLink is supported
++    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,     // Access to system memory is supported
++    NVML_NVLINK_CAP_P2P_ATOMICS   = 2,     // P2P atomics are supported
++    NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3,     // System memory atomics are supported
++    NVML_NVLINK_CAP_SLI_BRIDGE    = 4,     // SLI is supported over this link
++    NVML_NVLINK_CAP_VALID         = 5,     // Link is supported on this device
++    // should be last
++    NVML_NVLINK_CAP_COUNT
++} nvmlNvLinkCapability_t;
++
++/**
++ * Enum to represent NvLink queryable error counters
++ */
++typedef enum nvmlNvLinkErrorCounter_enum
++{
++    NVML_NVLINK_ERROR_DL_REPLAY   = 0,     // Data link transmit replay error counter
++    NVML_NVLINK_ERROR_DL_RECOVERY = 1,     // Data link transmit recovery error counter
++    NVML_NVLINK_ERROR_DL_CRC_FLIT = 2,     // Data link receive flow control digit CRC error counter
++    NVML_NVLINK_ERROR_DL_CRC_DATA = 3,     // Data link receive data CRC error counter
++    NVML_NVLINK_ERROR_DL_ECC_DATA = 4,     // Data link receive data ECC error counter
++
++    // this must be last
++    NVML_NVLINK_ERROR_COUNT
++} nvmlNvLinkErrorCounter_t;
++
++/**
++ * Enum to represent NvLink's remote device type
++ */
++typedef enum nvmlIntNvLinkDeviceType_enum
++{
++    NVML_NVLINK_DEVICE_TYPE_GPU     = 0x00,
++    NVML_NVLINK_DEVICE_TYPE_IBMNPU  = 0x01,
++    NVML_NVLINK_DEVICE_TYPE_SWITCH  = 0x02,
++    NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF
++} nvmlIntNvLinkDeviceType_t;
++
++/**
++ * Represents level relationships within a system between two GPUs
++ * The enums are spaced to allow for future relationships
++ */
++typedef enum nvmlGpuLevel_enum
++{
++    NVML_TOPOLOGY_INTERNAL           = 0, // e.g. Tesla K80
++    NVML_TOPOLOGY_SINGLE             = 10, // all devices that only need traverse a single PCIe switch
++    NVML_TOPOLOGY_MULTIPLE           = 20, // all devices that need not traverse a host bridge
++    NVML_TOPOLOGY_HOSTBRIDGE         = 30, // all devices that are connected to the same host bridge
++    NVML_TOPOLOGY_NODE               = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges
++    NVML_TOPOLOGY_SYSTEM             = 50  // all devices in the system
++
++    // there is purposefully no COUNT here because of the need for spacing above
++} nvmlGpuTopologyLevel_t;
++
++/* Compatibility for CPU->NODE renaming */
++#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE
++
++/* P2P Capability Index Status */
++typedef enum nvmlGpuP2PStatus_enum
++{
++    NVML_P2P_STATUS_OK     = 0,
++    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,    // misspelled name; the correctly spelled alias below has the same value
++    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,    // correctly spelled alias of the entry above
++    NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
++    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
++    NVML_P2P_STATUS_DISABLED_BY_REGKEY,
++    NVML_P2P_STATUS_NOT_SUPPORTED,
++    NVML_P2P_STATUS_UNKNOWN
++
++} nvmlGpuP2PStatus_t;
++
++/* P2P Capability Index*/
++typedef enum nvmlGpuP2PCapsIndex_enum
++{
++    NVML_P2P_CAPS_INDEX_READ = 0,
++    NVML_P2P_CAPS_INDEX_WRITE = 1,
++    NVML_P2P_CAPS_INDEX_NVLINK = 2,
++    NVML_P2P_CAPS_INDEX_ATOMICS = 3,
++    NVML_P2P_CAPS_INDEX_PCI = 4,
++    /*
++     * DO NOT USE! NVML_P2P_CAPS_INDEX_PROP is deprecated.
++     * Use NVML_P2P_CAPS_INDEX_PCI instead.
++     */
++    NVML_P2P_CAPS_INDEX_PROP = NVML_P2P_CAPS_INDEX_PCI,
++    NVML_P2P_CAPS_INDEX_UNKNOWN = 5,
++}nvmlGpuP2PCapsIndex_t;
++
++/**
++ * Maximum limit on Physical Bridges per Board
++ */
++#define NVML_MAX_PHYSICAL_BRIDGE                         (128)
++
++/**
++ * Information about the Bridge Chip Firmware
++ */
++typedef struct nvmlBridgeChipInfo_st
++{
++    nvmlBridgeChipType_t type;                  //!< Type of Bridge Chip
++    unsigned int fwVersion;                     //!< Firmware Version. 0=Version is unavailable
++}nvmlBridgeChipInfo_t;
++
++/**
++ * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate
++ * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth.
++ */
++typedef struct nvmlBridgeChipHierarchy_st
++{
++    unsigned char  bridgeCount;                 //!< Number of Bridge Chips on the Board
++    nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board
++}nvmlBridgeChipHierarchy_t;
++
++/**
++ *  Represents Type of Sampling Event
++ */
++typedef enum nvmlSamplingType_enum
++{
++    NVML_TOTAL_POWER_SAMPLES        = 0, //!< To represent total power drawn by GPU
++    NVML_GPU_UTILIZATION_SAMPLES    = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU
++    NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written
++    NVML_ENC_UTILIZATION_SAMPLES    = 3, //!< To represent percent of time during which NVENC remains busy
++    NVML_DEC_UTILIZATION_SAMPLES    = 4, //!< To represent percent of time during which NVDEC remains busy
++    NVML_PROCESSOR_CLK_SAMPLES      = 5, //!< To represent processor clock samples
++    NVML_MEMORY_CLK_SAMPLES         = 6, //!< To represent memory clock samples
++    NVML_MODULE_POWER_SAMPLES       = 7, //!< To represent power samples for the total module, starting with Grace Hopper
++    NVML_JPG_UTILIZATION_SAMPLES    = 8, //!< To represent percent of time during which NVJPG remains busy
++    NVML_OFA_UTILIZATION_SAMPLES    = 9, //!< To represent percent of time during which NVOFA remains busy
++
++    // Keep this last
++    NVML_SAMPLINGTYPE_COUNT
++}nvmlSamplingType_t;
++
++/**
++ * Represents the queryable PCIe utilization counters
++ */
++typedef enum nvmlPcieUtilCounter_enum
++{
++    NVML_PCIE_UTIL_TX_BYTES             = 0, // 1KB granularity
++    NVML_PCIE_UTIL_RX_BYTES             = 1, // 1KB granularity
++
++    // Keep this last
++    NVML_PCIE_UTIL_COUNT
++} nvmlPcieUtilCounter_t;
++
++/**
++ * Represents the type for sample value returned
++ */
++typedef enum nvmlValueType_enum
++{
++    NVML_VALUE_TYPE_DOUBLE = 0,
++    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
++    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
++    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
++    NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
++    NVML_VALUE_TYPE_SIGNED_INT = 5,
++    NVML_VALUE_TYPE_UNSIGNED_SHORT = 6,
++
++    // Keep this last
++    NVML_VALUE_TYPE_COUNT
++}nvmlValueType_t;
++
++
++/**
++ * Union to represent different types of Value
++ */
++typedef union nvmlValue_st
++{
++    double dVal;                    //!< If the value is double
++    int siVal;                      //!< If the value is signed int
++    unsigned int uiVal;             //!< If the value is unsigned int
++    unsigned long ulVal;            //!< If the value is unsigned long
++    unsigned long long ullVal;      //!< If the value is unsigned long long
++    signed long long sllVal;        //!< If the value is signed long long
++    unsigned short usVal;           //!< If the value is unsigned short
++}nvmlValue_t;
++
++/**
++ * Information for Sample
++ */
++typedef struct nvmlSample_st
++{
++    unsigned long long timeStamp;       //!< CPU Timestamp in microseconds
++    nvmlValue_t sampleValue;            //!< Sample Value
++}nvmlSample_t;
++
++/**
++ * Represents type of perf policy for which violation times can be queried
++ */
++typedef enum nvmlPerfPolicyType_enum
++{
++    NVML_PERF_POLICY_POWER = 0,              //!< How long did power violations cause the GPU to be below application clocks
++    NVML_PERF_POLICY_THERMAL = 1,            //!< How long did thermal violations cause the GPU to be below application clocks
++    NVML_PERF_POLICY_SYNC_BOOST = 2,         //!< How long did sync boost cause the GPU to be below application clocks
++    NVML_PERF_POLICY_BOARD_LIMIT = 3,        //!< How long did the board limit cause the GPU to be below application clocks
++    NVML_PERF_POLICY_LOW_UTILIZATION = 4,    //!< How long did low utilization cause the GPU to be below application clocks
++    NVML_PERF_POLICY_RELIABILITY = 5,        //!< How long did the board reliability limit cause the GPU to be below application clocks
++
++    NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10,  //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above)
++    NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks
++
++    // Keep this last
++    NVML_PERF_POLICY_COUNT
++}nvmlPerfPolicyType_t;
++
++/**
++ * Struct to hold perf policy violation status data
++ */
++typedef struct nvmlViolationTime_st
++{
++    unsigned long long referenceTime;  //!< referenceTime represents CPU timestamp in microseconds
++    unsigned long long violationTime;  //!< violationTime in Nanoseconds
++}nvmlViolationTime_t;
++
++#define NVML_MAX_THERMAL_SENSORS_PER_GPU  3
++
++/**
++ * Represents the thermal sensor targets
++ */
++typedef enum
++{
++    NVML_THERMAL_TARGET_NONE          = 0,
++    NVML_THERMAL_TARGET_GPU           = 1,     //!< GPU core temperature requires NvPhysicalGpuHandle
++    NVML_THERMAL_TARGET_MEMORY        = 2,     //!< GPU memory temperature requires NvPhysicalGpuHandle
++    NVML_THERMAL_TARGET_POWER_SUPPLY  = 4,     //!< GPU power supply temperature requires NvPhysicalGpuHandle
++    NVML_THERMAL_TARGET_BOARD         = 8,     //!< GPU board ambient temperature requires NvPhysicalGpuHandle
++    NVML_THERMAL_TARGET_VCD_BOARD     = 9,     //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle
++    NVML_THERMAL_TARGET_VCD_INLET     = 10,    //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle
++    NVML_THERMAL_TARGET_VCD_OUTLET    = 11,    //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle
++
++    NVML_THERMAL_TARGET_ALL           = 15,    //!< Numerically equal to GPU | MEMORY | POWER_SUPPLY | BOARD (1|2|4|8)
++    NVML_THERMAL_TARGET_UNKNOWN       = -1,
++} nvmlThermalTarget_t;
++
++/**
++ * Represents the thermal sensor controllers
++ */
++typedef enum
++{
++    NVML_THERMAL_CONTROLLER_NONE = 0,
++    NVML_THERMAL_CONTROLLER_GPU_INTERNAL,
++    NVML_THERMAL_CONTROLLER_ADM1032,
++    NVML_THERMAL_CONTROLLER_ADT7461,
++    NVML_THERMAL_CONTROLLER_MAX6649,
++    NVML_THERMAL_CONTROLLER_MAX1617,
++    NVML_THERMAL_CONTROLLER_LM99,
++    NVML_THERMAL_CONTROLLER_LM89,
++    NVML_THERMAL_CONTROLLER_LM64,
++    NVML_THERMAL_CONTROLLER_G781,
++    NVML_THERMAL_CONTROLLER_ADT7473,
++    NVML_THERMAL_CONTROLLER_SBMAX6649,
++    NVML_THERMAL_CONTROLLER_VBIOSEVT,
++    NVML_THERMAL_CONTROLLER_OS,
++    NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS,
++    NVML_THERMAL_CONTROLLER_NVSYSCON_E551,
++    NVML_THERMAL_CONTROLLER_MAX6649R,
++    NVML_THERMAL_CONTROLLER_ADT7473S,
++    NVML_THERMAL_CONTROLLER_UNKNOWN = -1,
++} nvmlThermalController_t;
++
++/**
++ * Struct to hold the thermal sensor settings
++ */
++typedef struct
++{
++    unsigned int   count;    //!< Presumably the number of valid entries in sensor[] -- confirm against the API that fills this struct
++    struct
++    {
++        nvmlThermalController_t controller;    //!< Controller for this sensor (see nvmlThermalController_t)
++        int defaultMinTemp;    //!< Default minimum temperature; units are not stated here -- confirm
++        int defaultMaxTemp;    //!< Default maximum temperature; units are not stated here -- confirm
++        int currentTemp;    //!< Current temperature; units are not stated here -- confirm
++        nvmlThermalTarget_t target;    //!< Target of this sensor (see nvmlThermalTarget_t)
++    } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU];
++
++} nvmlGpuThermalSettings_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlDeviceEnums Device Enums
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Generic enable/disable enum.
++ */
++typedef enum nvmlEnableState_enum
++{
++    NVML_FEATURE_DISABLED    = 0,     //!< Feature disabled
++    NVML_FEATURE_ENABLED     = 1      //!< Feature enabled
++} nvmlEnableState_t;
++
++//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details.
++#define nvmlFlagDefault     0x00
++//! Generic flag used to force some behavior. See description of particular functions for details.
++#define nvmlFlagForce       0x01
++
++/**
++ * The Brand of the GPU
++ */
++typedef enum nvmlBrandType_enum
++{
++    NVML_BRAND_UNKNOWN              = 0,
++    NVML_BRAND_QUADRO               = 1,
++    NVML_BRAND_TESLA                = 2,
++    NVML_BRAND_NVS                  = 3,
++    NVML_BRAND_GRID                 = 4,   // Deprecated from API reporting. Keeping definition for backward compatibility.
++    NVML_BRAND_GEFORCE              = 5,
++    NVML_BRAND_TITAN                = 6,
++    NVML_BRAND_NVIDIA_VAPPS         = 7,   // NVIDIA Virtual Applications
++    NVML_BRAND_NVIDIA_VPC           = 8,   // NVIDIA Virtual PC
++    NVML_BRAND_NVIDIA_VCS           = 9,   // NVIDIA Virtual Compute Server
++    NVML_BRAND_NVIDIA_VWS           = 10,  // NVIDIA RTX Virtual Workstation
++    NVML_BRAND_NVIDIA_CLOUD_GAMING  = 11,  // NVIDIA Cloud Gaming
++    NVML_BRAND_NVIDIA_VGAMING       = NVML_BRAND_NVIDIA_CLOUD_GAMING,  // Deprecated from API reporting. Keeping definition for backward compatibility.
++    NVML_BRAND_QUADRO_RTX           = 12,
++    NVML_BRAND_NVIDIA_RTX           = 13,
++    NVML_BRAND_NVIDIA               = 14,
++    NVML_BRAND_GEFORCE_RTX          = 15,  // Unused
++    NVML_BRAND_TITAN_RTX            = 16,  // Unused
++
++    // Keep this last
++    NVML_BRAND_COUNT
++} nvmlBrandType_t;
++
++/**
++ * Temperature thresholds.
++ */
++typedef enum nvmlTemperatureThresholds_enum
++{
++    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN      = 0, // Temperature at which the GPU will
++                                                  // shut down for HW protection
++    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN      = 1, // Temperature at which the GPU will
++                                                  // begin HW slowdown
++    NVML_TEMPERATURE_THRESHOLD_MEM_MAX       = 2, // Memory Temperature at which the GPU will
++                                                  // begin SW slowdown
++    NVML_TEMPERATURE_THRESHOLD_GPU_MAX       = 3, // GPU Temperature at which the GPU
++                                                  // can be throttled below base clock
++    NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN  = 4, // Minimum GPU Temperature that can be
++                                                  // set as acoustic threshold
++    NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, // Current temperature that is set as
++                                                  // acoustic threshold.
++    NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX  = 6, // Maximum GPU temperature that can be
++                                                  // set as acoustic threshold.
++    NVML_TEMPERATURE_THRESHOLD_GPS_CURR      = 7, // Current temperature that is set as
++                                                  // gps threshold.
++    // Keep this last
++    NVML_TEMPERATURE_THRESHOLD_COUNT
++} nvmlTemperatureThresholds_t;
++
++/**
++ * Temperature sensors.
++ */
++typedef enum nvmlTemperatureSensors_enum
++{
++    NVML_TEMPERATURE_GPU      = 0,    //!< Temperature sensor for the GPU die
++
++    // Keep this last
++    NVML_TEMPERATURE_COUNT
++} nvmlTemperatureSensors_t;
++
++/**
++ * Compute mode.
++ *
++ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
++ * Earlier CUDA versions supported a single exclusive mode,
++ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
++ */
++typedef enum nvmlComputeMode_enum
++{
++    NVML_COMPUTEMODE_DEFAULT           = 0,  //!< Default compute mode -- multiple contexts per device
++    NVML_COMPUTEMODE_EXCLUSIVE_THREAD  = 1,  //!< Support Removed
++    NVML_COMPUTEMODE_PROHIBITED        = 2,  //!< Compute-prohibited mode -- no contexts per device
++    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,  //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
++
++    // Keep this last
++    NVML_COMPUTEMODE_COUNT
++} nvmlComputeMode_t;
++
++/**
++ * Max Clock Monitors available
++ */
++#define MAX_CLK_DOMAINS			32
++
++/**
++ * Clock Monitor error types
++ */
++typedef struct nvmlClkMonFaultInfo_struct {
++    /**
++     * The Domain which faulted
++     */
++    unsigned int   clkApiDomain;
++
++    /**
++     * Faults Information
++     */
++    unsigned int   clkDomainFaultMask;
++} nvmlClkMonFaultInfo_t;
++
++/**
++ * Clock Monitor Status
++ */
++typedef struct nvmlClkMonStatus_status {
++    /**
++     * Fault status Indicator
++     */
++    unsigned int  bGlobalStatus;
++
++    /**
++     * Total faulted domain numbers
++     */
++    unsigned int   clkMonListSize;
++
++    /**
++     * The fault Information structure
++     */
++    nvmlClkMonFaultInfo_t clkMonList[MAX_CLK_DOMAINS];
++} nvmlClkMonStatus_t;
++
++/**
++ * ECC bit types.
++ *
++ * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type
++ */
++#define nvmlEccBitType_t nvmlMemoryErrorType_t
++
++/**
++ * Single bit ECC errors
++ *
++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
++ */
++#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED
++
++/**
++ * Double bit ECC errors
++ *
++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
++ */
++#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED
++
++/**
++ * Memory error types
++ */
++typedef enum nvmlMemoryErrorType_enum
++{
++    /**
++     * A memory error that was corrected
++     *
++     * For ECC errors, these are single bit errors
++     * For Texture memory, these are errors fixed by resend
++     */
++    NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,
++    /**
++     * A memory error that was not corrected
++     *
++     * For ECC errors, these are double bit errors
++     * For Texture memory, these are errors where the resend fails
++     */
++    NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,
++
++
++    // Keep this last
++    NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types
++
++} nvmlMemoryErrorType_t;
++
++/**
++ * ECC counter types.
++ *
++ * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent.
++ *       On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver
++ *       client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app
++ *       is run.
++ */
++typedef enum nvmlEccCounterType_enum
++{
++    NVML_VOLATILE_ECC      = 0,      //!< Volatile counts are reset each time the driver loads.
++    NVML_AGGREGATE_ECC     = 1,      //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)
++
++    // Keep this last
++    NVML_ECC_COUNTER_TYPE_COUNT      //!< Count of memory counter types
++} nvmlEccCounterType_t;
++
++/**
++ * Clock types.
++ *
++ * All speeds are in MHz.
++ */
++typedef enum nvmlClockType_enum
++{
++    NVML_CLOCK_GRAPHICS  = 0,        //!< Graphics clock domain
++    NVML_CLOCK_SM        = 1,        //!< SM clock domain
++    NVML_CLOCK_MEM       = 2,        //!< Memory clock domain
++    NVML_CLOCK_VIDEO     = 3,        //!< Video encoder/decoder clock domain
++
++    // Keep this last
++    NVML_CLOCK_COUNT //!< Count of clock types
++} nvmlClockType_t;
++
++/**
++ * Clock Ids.  These are used in combination with nvmlClockType_t
++ * to specify a single clock value.
++ */
++typedef enum nvmlClockId_enum
++{
++    NVML_CLOCK_ID_CURRENT            = 0,   //!< Current actual clock value
++    NVML_CLOCK_ID_APP_CLOCK_TARGET   = 1,   //!< Target application clock
++    NVML_CLOCK_ID_APP_CLOCK_DEFAULT  = 2,   //!< Default application clock target
++    NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3,   //!< OEM-defined maximum clock rate
++
++    //Keep this last
++    NVML_CLOCK_ID_COUNT //!< Count of Clock Ids.
++} nvmlClockId_t;
++
++/**
++ * Driver models.
++ *
++ * Windows only.
++ */
++
++typedef enum nvmlDriverModel_enum
++{
++    NVML_DRIVER_WDDM      = 0,       //!< WDDM driver model -- GPU treated as a display device
++    NVML_DRIVER_WDM       = 1,       //!< WDM (TCC) model (deprecated) -- GPU treated as a generic compute device
++    NVML_DRIVER_MCDM      = 2        //!< MCDM driver model -- GPU treated as a Microsoft compute device
++} nvmlDriverModel_t;
++
++#define NVML_MAX_GPU_PERF_PSTATES 16
++
++/**
++ * Allowed PStates.
++ */
++typedef enum nvmlPStates_enum
++{
++    NVML_PSTATE_0               = 0,       //!< Performance state 0 -- Maximum Performance
++    NVML_PSTATE_1               = 1,       //!< Performance state 1
++    NVML_PSTATE_2               = 2,       //!< Performance state 2
++    NVML_PSTATE_3               = 3,       //!< Performance state 3
++    NVML_PSTATE_4               = 4,       //!< Performance state 4
++    NVML_PSTATE_5               = 5,       //!< Performance state 5
++    NVML_PSTATE_6               = 6,       //!< Performance state 6
++    NVML_PSTATE_7               = 7,       //!< Performance state 7
++    NVML_PSTATE_8               = 8,       //!< Performance state 8
++    NVML_PSTATE_9               = 9,       //!< Performance state 9
++    NVML_PSTATE_10              = 10,      //!< Performance state 10
++    NVML_PSTATE_11              = 11,      //!< Performance state 11
++    NVML_PSTATE_12              = 12,      //!< Performance state 12
++    NVML_PSTATE_13              = 13,      //!< Performance state 13
++    NVML_PSTATE_14              = 14,      //!< Performance state 14
++    NVML_PSTATE_15              = 15,      //!< Performance state 15 -- Minimum Performance
++    NVML_PSTATE_UNKNOWN         = 32       //!< Unknown performance state
++} nvmlPstates_t;
++
++/**
++ * Clock offset info.
++ */
++typedef struct
++{
++    unsigned int version; //!< The version number of this struct
++    nvmlClockType_t type; //!< Clock domain this entry refers to (see nvmlClockType_t)
++    nvmlPstates_t pstate; //!< Performance state this entry refers to (see nvmlPstates_t)
++    int clockOffsetMHz; //!< Clock offset, in MHz per the field name
++    int minClockOffsetMHz; //!< Presumably the lowest allowed offset, in MHz per the field name -- confirm
++    int maxClockOffsetMHz; //!< Presumably the highest allowed offset, in MHz per the field name -- confirm
++} nvmlClockOffset_v1_t;
++
++typedef nvmlClockOffset_v1_t nvmlClockOffset_t;
++
++#define nvmlClockOffset_v1 NVML_STRUCT_VERSION(ClockOffset, 1)
++
++/**
++ * GPU Operation Mode
++ *
++ * GOM makes it possible to reduce power usage and optimize GPU throughput by disabling GPU features.
++ *
++ * Each GOM is designed to meet specific user needs.
++ */
++typedef enum nvmlGom_enum
++{
++    NVML_GOM_ALL_ON                    = 0, //!< Everything is enabled and running at full speed
++
++    NVML_GOM_COMPUTE                   = 1, //!< Designed for running only compute tasks. Graphics operations
++                                            //!< are not allowed
++
++    NVML_GOM_LOW_DP                    = 2  //!< Designed for running graphics applications that don't require
++                                            //!< high bandwidth double precision
++} nvmlGpuOperationMode_t;
++
++/**
++ * Available infoROM objects.
++ */
++typedef enum nvmlInforomObject_enum
++{
++    NVML_INFOROM_OEM            = 0,       //!< An object defined by OEM
++    NVML_INFOROM_ECC            = 1,       //!< The ECC object determining the level of ECC support
++    NVML_INFOROM_POWER          = 2,       //!< The power management object
++
++    // Keep this last
++    NVML_INFOROM_COUNT                     //!< This counts the number of infoROM objects the driver knows about
++} nvmlInforomObject_t;
++
++/**
++ * Return values for NVML API calls.
++ */
++typedef enum nvmlReturn_enum
++{
++    // cppcheck-suppress *
++    NVML_SUCCESS = 0,                          //!< The operation was successful
++    NVML_ERROR_UNINITIALIZED = 1,              //!< NVML was not first initialized with nvmlInit()
++    NVML_ERROR_INVALID_ARGUMENT = 2,           //!< A supplied argument is invalid
++    NVML_ERROR_NOT_SUPPORTED = 3,              //!< The requested operation is not available on target device
++    NVML_ERROR_NO_PERMISSION = 4,              //!< The current user does not have permission for operation
++    NVML_ERROR_ALREADY_INITIALIZED = 5,        //!< Deprecated: Multiple initializations are now allowed through ref counting
++    NVML_ERROR_NOT_FOUND = 6,                  //!< A query to find an object was unsuccessful
++    NVML_ERROR_INSUFFICIENT_SIZE = 7,          //!< An input argument is not large enough
++    NVML_ERROR_INSUFFICIENT_POWER = 8,         //!< A device's external power cables are not properly attached
++    NVML_ERROR_DRIVER_NOT_LOADED = 9,          //!< NVIDIA driver is not loaded
++    NVML_ERROR_TIMEOUT = 10,                   //!< User provided timeout passed
++    NVML_ERROR_IRQ_ISSUE = 11,                 //!< NVIDIA Kernel detected an interrupt issue with a GPU
++    NVML_ERROR_LIBRARY_NOT_FOUND = 12,         //!< NVML Shared Library couldn't be found or loaded
++    NVML_ERROR_FUNCTION_NOT_FOUND = 13,        //!< Local version of NVML doesn't implement this function
++    NVML_ERROR_CORRUPTED_INFOROM = 14,         //!< infoROM is corrupted
++    NVML_ERROR_GPU_IS_LOST = 15,               //!< The GPU has fallen off the bus or has otherwise become inaccessible
++    NVML_ERROR_RESET_REQUIRED = 16,            //!< The GPU requires a reset before it can be used again
++    NVML_ERROR_OPERATING_SYSTEM = 17,          //!< The GPU control device has been blocked by the operating system/cgroups
++    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
++    NVML_ERROR_IN_USE = 19,                    //!< An operation cannot be performed because the GPU is currently in use
++    NVML_ERROR_MEMORY = 20,                    //!< Insufficient memory
++    NVML_ERROR_NO_DATA = 21,                   //!< No data
++    NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22,    //!< The requested vGPU operation is not available on target device, because ECC is enabled
++    NVML_ERROR_INSUFFICIENT_RESOURCES = 23,    //!< Ran out of critical resources, other than memory
++    NVML_ERROR_FREQ_NOT_SUPPORTED = 24,        //!< Presumably: the requested frequency is not supported (inferred from the name; the previous text duplicated the entry above) -- confirm against NVML docs
++    NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported
++    NVML_ERROR_DEPRECATED  = 26,               //!< The requested functionality has been deprecated
++    NVML_ERROR_NOT_READY = 27,                 //!< The system is not ready for the request
++    NVML_ERROR_GPU_NOT_FOUND = 28,             //!< No GPUs were found
++    NVML_ERROR_INVALID_STATE = 29,             //!< Resource not in correct state to perform requested operation
++    NVML_ERROR_UNKNOWN = 999                   //!< An internal driver error occurred
++} nvmlReturn_t;
++
++/**
++ * See \ref nvmlDeviceGetMemoryErrorCounter
++ */
++typedef enum nvmlMemoryLocation_enum
++{
++    NVML_MEMORY_LOCATION_L1_CACHE        = 0,    //!< GPU L1 Cache
++    NVML_MEMORY_LOCATION_L2_CACHE        = 1,    //!< GPU L2 Cache
++    NVML_MEMORY_LOCATION_DRAM            = 2,    //!< Turing+ DRAM
++    NVML_MEMORY_LOCATION_DEVICE_MEMORY   = 2,    //!< GPU Device Memory (same value as NVML_MEMORY_LOCATION_DRAM)
++    NVML_MEMORY_LOCATION_REGISTER_FILE   = 3,    //!< GPU Register File
++    NVML_MEMORY_LOCATION_TEXTURE_MEMORY  = 4,    //!< GPU Texture Memory
++    NVML_MEMORY_LOCATION_TEXTURE_SHM     = 5,    //!< Shared memory
++    NVML_MEMORY_LOCATION_CBU             = 6,    //!< CBU
++    NVML_MEMORY_LOCATION_SRAM            = 7,    //!< Turing+ SRAM
++    // Keep this last
++    NVML_MEMORY_LOCATION_COUNT              //!< This counts the number of memory locations the driver knows about
++} nvmlMemoryLocation_t;
++
++/**
++ * Causes for page retirement
++ */
++typedef enum nvmlPageRetirementCause_enum
++{
++    NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC errors
++    NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1,           //!< Page was retired due to a double bit ECC error
++
++    // Keep this last
++    NVML_PAGE_RETIREMENT_CAUSE_COUNT
++} nvmlPageRetirementCause_t;
++
++/**
++ * API types that allow changes to default permission restrictions
++ */
++typedef enum nvmlRestrictedAPI_enum
++{
++    NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0,   //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks
++                                                      //!< and see nvmlDeviceResetApplicationsClocks
++    NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1,  //!< APIs that enable/disable Auto Boosted clocks
++                                                      //!< see nvmlDeviceSetAutoBoostedClocksEnabled
++    // Keep this last
++    NVML_RESTRICTED_API_COUNT
++} nvmlRestrictedAPI_t;
++
++/**
++ * Structure to store utilization value and process Id
++ */
++typedef struct nvmlProcessUtilizationSample_st
++{
++    unsigned int        pid;            //!< PID of process
++    unsigned long long  timeStamp;      //!< CPU Timestamp in microseconds
++    unsigned int        smUtil;         //!< SM (3D/Compute) Util Value
++    unsigned int        memUtil;        //!< Frame Buffer Memory Util Value
++    unsigned int        encUtil;        //!< Encoder Util Value
++    unsigned int        decUtil;        //!< Decoder Util Value
++} nvmlProcessUtilizationSample_t;
++
++/**
++ * Structure to store utilization value and process Id -- version 1
++ */
++typedef struct
++{
++    unsigned long long  timeStamp;      //!< CPU Timestamp in microseconds
++    unsigned int        pid;            //!< PID of process
++    unsigned int        smUtil;         //!< SM (3D/Compute) Util Value
++    unsigned int        memUtil;        //!< Frame Buffer Memory Util Value
++    unsigned int        encUtil;        //!< Encoder Util Value
++    unsigned int        decUtil;        //!< Decoder Util Value
++    unsigned int        jpgUtil;        //!< Jpeg Util Value
++    unsigned int        ofaUtil;        //!< Ofa Util Value
++} nvmlProcessUtilizationInfo_v1_t;
++
++/**
++ * Structure to store utilization and process ID for each running process -- version 1
++ */
++typedef struct
++{
++    unsigned int version;                           //!< The version number of this struct
++    unsigned int processSamplesCount;               //!< Caller-supplied array size, and returns number of processes running
++    unsigned long long lastSeenTimeStamp;           //!< Return only samples with timestamp greater than lastSeenTimeStamp
++    nvmlProcessUtilizationInfo_v1_t *procUtilArray; //!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA
++} nvmlProcessesUtilizationInfo_v1_t;
++typedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t;
++#define nvmlProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(ProcessesUtilizationInfo, 1)
++
++/**
++ * Structure to store SRAM uncorrectable error counters
++ */
++typedef struct
++{
++    unsigned int version;                                   //!< the API version number
++    unsigned long long aggregateUncParity;                  //!< aggregate uncorrectable parity error count
++    unsigned long long aggregateUncSecDed;                  //!< aggregate uncorrectable SEC-DED error count
++    unsigned long long aggregateCor;                        //!< aggregate correctable error count
++    unsigned long long volatileUncParity;                   //!< volatile uncorrectable parity error count
++    unsigned long long volatileUncSecDed;                   //!< volatile uncorrectable SEC-DED error count
++    unsigned long long volatileCor;                         //!< volatile correctable error count
++    unsigned long long aggregateUncBucketL2;                //!< aggregate uncorrectable error count for L2 cache bucket
++    unsigned long long aggregateUncBucketSm;                //!< aggregate uncorrectable error count for SM bucket
++    unsigned long long aggregateUncBucketPcie;              //!< aggregate uncorrectable error count for PCIE bucket
++    unsigned long long aggregateUncBucketMcu;               //!< aggregate uncorrectable error count for Microcontroller bucket
++    unsigned long long aggregateUncBucketOther;             //!< aggregate uncorrectable error count for Other bucket
++    unsigned int bThresholdExceeded;                        //!< if the error threshold of field diag is exceeded
++} nvmlEccSramErrorStatus_v1_t;
++
++typedef nvmlEccSramErrorStatus_v1_t nvmlEccSramErrorStatus_t;
++#define nvmlEccSramErrorStatus_v1 NVML_STRUCT_VERSION(EccSramErrorStatus, 1)
++
++/**
++ * GSP firmware: size of the version buffer (0x40 = 64)
++ */
++#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40
++
++/**
++ * Simplified chip architecture identifiers (values 2 through 11 are defined here; 0xffffffff marks an unknown architecture)
++ */
++#define NVML_DEVICE_ARCH_KEPLER    2 // Devices based on the NVIDIA Kepler architecture
++#define NVML_DEVICE_ARCH_MAXWELL   3 // Devices based on the NVIDIA Maxwell architecture
++#define NVML_DEVICE_ARCH_PASCAL    4 // Devices based on the NVIDIA Pascal architecture
++#define NVML_DEVICE_ARCH_VOLTA     5 // Devices based on the NVIDIA Volta architecture
++#define NVML_DEVICE_ARCH_TURING    6 // Devices based on the NVIDIA Turing architecture
++#define NVML_DEVICE_ARCH_AMPERE    7 // Devices based on the NVIDIA Ampere architecture
++#define NVML_DEVICE_ARCH_ADA       8 // Devices based on the NVIDIA Ada architecture
++#define NVML_DEVICE_ARCH_HOPPER    9 // Devices based on the NVIDIA Hopper architecture
++
++#define NVML_DEVICE_ARCH_BLACKWELL 10 // Devices based on the NVIDIA Blackwell architecture
++
++#define NVML_DEVICE_ARCH_T23X      11 // Devices based on the NVIDIA Orin architecture
++
++#define NVML_DEVICE_ARCH_UNKNOWN   0xffffffff // Anything else, presumably something newer
++
++typedef unsigned int nvmlDeviceArchitecture_t;
++
++/**
++ * Bus types (PCI, PCIe, FPCI, AGP, or unknown)
++ */
++#define NVML_BUS_TYPE_UNKNOWN  0
++#define NVML_BUS_TYPE_PCI      1
++#define NVML_BUS_TYPE_PCIE     2
++#define NVML_BUS_TYPE_FPCI     3
++#define NVML_BUS_TYPE_AGP      4
++
++typedef unsigned int nvmlBusType_t;
++
++/**
++ * Device Power Modes
++ */
++
++/**
++ * Device Fan control policy (note: "CONTINOUS" is the spelling of the identifier itself; do not "correct" it)
++ */
++#define NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW 0
++#define NVML_FAN_POLICY_MANUAL                   1
++
++typedef unsigned int nvmlFanControlPolicy_t;
++
++/**
++ * Device Power Source
++ */
++#define NVML_POWER_SOURCE_AC         0x00000000
++#define NVML_POWER_SOURCE_BATTERY    0x00000001
++#define NVML_POWER_SOURCE_UNDERSIZED 0x00000002
++
++typedef unsigned int nvmlPowerSource_t;
++
++/*
++ * Device PCIE link Max Speed
++ */
++#define NVML_PCIE_LINK_MAX_SPEED_INVALID   0x00000000
++#define NVML_PCIE_LINK_MAX_SPEED_2500MBPS  0x00000001
++#define NVML_PCIE_LINK_MAX_SPEED_5000MBPS  0x00000002
++#define NVML_PCIE_LINK_MAX_SPEED_8000MBPS  0x00000003
++#define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004
++#define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005
++#define NVML_PCIE_LINK_MAX_SPEED_64000MBPS 0x00000006
++
++/*
++ * Adaptive clocking status
++ */
++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000
++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED  0x00000001
++
++#define NVML_MAX_GPU_UTILIZATIONS 8 //!< Number of entries in the utilization array of nvmlGpuDynamicPstatesInfo_t
++
++/**
++ * Represents the GPU utilization domains
++ */
++typedef enum nvmlGpuUtilizationDomainId_t
++{
++    NVML_GPU_UTILIZATION_DOMAIN_GPU    = 0, //!< Graphics engine domain
++    NVML_GPU_UTILIZATION_DOMAIN_FB     = 1, //!< Frame buffer domain
++    NVML_GPU_UTILIZATION_DOMAIN_VID    = 2, //!< Video engine domain
++    NVML_GPU_UTILIZATION_DOMAIN_BUS    = 3, //!< Bus interface domain
++} nvmlGpuUtilizationDomainId_t;
++
++typedef struct nvmlGpuDynamicPstatesInfo_st // Per-domain utilization readings and P-State change thresholds
++{
++    unsigned int       flags;          //!< Reserved for future use
++    struct
++    {
++        unsigned int   bIsPresent;     //!< Set if this utilization domain is present on this GPU
++        unsigned int   percentage;     //!< Percentage of time where the domain is considered busy in the last 1-second interval
++        unsigned int   incThreshold;   //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed
++        unsigned int   decThreshold;   //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed
++    } utilization[NVML_MAX_GPU_UTILIZATIONS]; //!< One entry per domain; presumably indexed by nvmlGpuUtilizationDomainId_t -- TODO confirm
++} nvmlGpuDynamicPstatesInfo_t;
++
++/*
++ * PCIe outbound/inbound atomic operations capability
++ */
++#define NVML_PCIE_ATOMICS_CAP_FETCHADD32  0x01
++#define NVML_PCIE_ATOMICS_CAP_FETCHADD64  0x02
++#define NVML_PCIE_ATOMICS_CAP_SWAP32      0x04
++#define NVML_PCIE_ATOMICS_CAP_SWAP64      0x08
++#define NVML_PCIE_ATOMICS_CAP_CAS32       0x10
++#define NVML_PCIE_ATOMICS_CAP_CAS64       0x20
++#define NVML_PCIE_ATOMICS_CAP_CAS128      0x40
++#define NVML_PCIE_ATOMICS_OPS_MAX         7
++
++/** @} */
++
++/***************************************************************************************************/
++/** @addtogroup virtualGPU vGPU Enums, Constants, Structs
++ *  @{
++ */
++/***************************************************************************************************/
++/** @defgroup nvmlVirtualGpuEnums vGPU Enums
++ *  @{
++ */
++/***************************************************************************************************/
++
++/*!
++ * GPU virtualization mode types.
++ */
++typedef enum nvmlGpuVirtualizationMode {
++    NVML_GPU_VIRTUALIZATION_MODE_NONE = 0,  //!< Represents Bare Metal GPU
++    NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1,  //!< Device is associated with GPU-Passthrough
++    NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2,  //!< Device is associated with vGPU inside virtual machine.
++    NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3,  //!< Device is associated with VGX hypervisor in vGPU mode
++    NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4   //!< Device is associated with VGX hypervisor in vSGA mode
++} nvmlGpuVirtualizationMode_t;
++
++/**
++ * Host vGPU modes
++ */
++typedef enum nvmlHostVgpuMode_enum
++{
++    NVML_HOST_VGPU_MODE_NON_SRIOV    = 0,     //!< Non SR-IOV mode
++    NVML_HOST_VGPU_MODE_SRIOV        = 1      //!< SR-IOV mode
++} nvmlHostVgpuMode_t;
++
++/*!
++ * Types of VM identifiers
++ */
++typedef enum nvmlVgpuVmIdType {
++    NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID
++    NVML_VGPU_VM_ID_UUID = 1       //!< VM ID represents UUID
++} nvmlVgpuVmIdType_t;
++
++/**
++ * vGPU GUEST info state
++ */
++typedef enum nvmlVgpuGuestInfoState_enum
++{
++    NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0,  //!< Guest-dependent fields uninitialized
++    NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED   = 1   //!< Guest-dependent fields initialized
++} nvmlVgpuGuestInfoState_t;
++
++/**
++ * vGPU software licensable features
++ */
++typedef enum {
++    NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN      = 0,                                         //!< Unknown
++    NVML_GRID_LICENSE_FEATURE_CODE_VGPU         = 1,                                         //!< Virtual GPU
++    NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX   = 2,                                         //!< Nvidia RTX
++    NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, //!< Deprecated alias of NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, do not use.
++    NVML_GRID_LICENSE_FEATURE_CODE_GAMING       = 3,                                         //!< Gaming
++    NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE      = 4                                          //!< Compute
++} nvmlGridLicenseFeatureCode_t;
++
++/**
++ * Status codes for license expiry
++ */
++#define NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE   0   //!< Expiry information not available
++#define NVML_GRID_LICENSE_EXPIRY_INVALID         1   //!< Invalid expiry or error fetching expiry
++#define NVML_GRID_LICENSE_EXPIRY_VALID           2   //!< Valid expiry
++#define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE  3   //!< Expiry not applicable
++#define NVML_GRID_LICENSE_EXPIRY_PERMANENT       4   //!< Permanent expiry
++
++/**
++ * vGPU queryable capabilities
++ */
++typedef enum nvmlVgpuCapability_enum
++{
++    NVML_VGPU_CAP_NVLINK_P2P                    = 0,  //!< P2P over NVLink is supported
++    NVML_VGPU_CAP_GPUDIRECT                     = 1,  //!< GPUDirect capability is supported
++    NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE          = 2,  //!< vGPU profile cannot be mixed with other vGPU profiles in same VM
++    NVML_VGPU_CAP_EXCLUSIVE_TYPE                = 3,  //!< vGPU profile cannot run on a GPU alongside other profiles of different type
++    NVML_VGPU_CAP_EXCLUSIVE_SIZE                = 4,  //!< vGPU profile cannot run on a GPU alongside other profiles of different size
++    // Keep this last
++    NVML_VGPU_CAP_COUNT
++} nvmlVgpuCapability_t;
++
++/**
++ * vGPU driver queryable capabilities
++ */
++typedef enum nvmlVgpuDriverCapability_enum
++{
++    NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0,      //!< Supports mixing of different vGPU profiles within one guest VM
++    NVML_VGPU_DRIVER_CAP_WARM_UPDATE              = 1,      //!< Supports FSR and warm update of vGPU host driver without terminating the running guest VM
++    // Keep this last
++    NVML_VGPU_DRIVER_CAP_COUNT
++} nvmlVgpuDriverCapability_t;
++
++/**
++ * Device vGPU queryable capabilities
++ */
++typedef enum nvmlDeviceVgpuCapability_enum
++{
++    NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU            = 0,    //!< Query if the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations
++    NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1,    //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing types
++    NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES    = 2,    //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes
++    NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW            = 3,    //!< Query the GPU's read_device_buffer expected bandwidth capacity in megabytes per second
++    NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW           = 4,    //!< Query the GPU's write_device_buffer expected bandwidth capacity in megabytes per second
++    NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING                 = 5,    //!< Query if vGPU profiles on the GPU support migration data streaming
++    NVML_DEVICE_VGPU_CAP_MINI_QUARTER_GPU                 = 6,    //!< Set/Get support for mini-quarter vGPU profiles
++    NVML_DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU         = 7,    //!< Set/Get support for compute media engine vGPU profiles
++    NVML_DEVICE_VGPU_CAP_WARM_UPDATE                      = 8,    //!< Query if the GPU supports FSR and warm update
++    // Keep this last
++    NVML_DEVICE_VGPU_CAP_COUNT
++} nvmlDeviceVgpuCapability_t;
++
++/** @} */
++
++/***************************************************************************************************/
++
++/** @defgroup nvmlVgpuConstants vGPU Constants
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense
++ */
++#define NVML_GRID_LICENSE_BUFFER_SIZE       128
++
++#define NVML_VGPU_NAME_BUFFER_SIZE          64 //!< Size of the processName buffers in the vGPU process utilization structs
++
++#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 //!< Capacity of the gridLicensableFeatures array in nvmlGridLicensableFeatures_t
++
++#define INVALID_GPU_INSTANCE_PROFILE_ID     0xFFFFFFFF
++
++#define INVALID_GPU_INSTANCE_ID             0xFFFFFFFF
++
++#define NVML_INVALID_VGPU_PLACEMENT_ID      0xFFFF
++
++/*!
++ * Macros for vGPU instance's virtualization capabilities bitfield.
++ */
++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION         0:0
++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO      0x0
++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES     0x1
++
++/*!
++ * Macros for pGPU's virtualization capabilities bitfield.
++ */
++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION         0:0
++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO      0x0
++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES     0x1
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlVgpuStructs vGPU Structs
++ *  @{
++ */
++/***************************************************************************************************/
++
++typedef unsigned int nvmlVgpuTypeId_t;
++
++typedef unsigned int nvmlVgpuInstance_t;
++
++/**
++ * Structure to store the vGPU heterogeneous mode of device -- version 1
++ */
++typedef struct
++{
++    unsigned int version;               //!< The version number of this struct
++    unsigned int mode;                  //!< The vGPU heterogeneous mode
++} nvmlVgpuHeterogeneousMode_v1_t;
++typedef nvmlVgpuHeterogeneousMode_v1_t nvmlVgpuHeterogeneousMode_t;
++#define nvmlVgpuHeterogeneousMode_v1 NVML_STRUCT_VERSION(VgpuHeterogeneousMode, 1)
++
++/**
++ * Structure to store the placement ID of vGPU instance -- version 1
++ */
++typedef struct
++{
++    unsigned int version;               //!< The version number of this struct
++    unsigned int placementId;           //!< Placement ID of the active vGPU instance
++} nvmlVgpuPlacementId_v1_t;
++typedef nvmlVgpuPlacementId_v1_t nvmlVgpuPlacementId_t;
++#define nvmlVgpuPlacementId_v1 NVML_STRUCT_VERSION(VgpuPlacementId, 1)
++
++/**
++ * Structure to store the list of vGPU placements -- version 1
++ */
++typedef struct
++{
++    unsigned int version;               //!< The version number of this struct
++    unsigned int placementSize;         //!< The number of slots occupied by the vGPU type
++    unsigned int count;                 //!< Count of placement IDs fetched
++    unsigned int *placementIds;         //!< Placement IDs for the vGPU type
++} nvmlVgpuPlacementList_v1_t;
++typedef nvmlVgpuPlacementList_v1_t nvmlVgpuPlacementList_t;
++#define nvmlVgpuPlacementList_v1 NVML_STRUCT_VERSION(VgpuPlacementList, 1)
++
++/**
++ * Structure to store BAR1 size information of vGPU type -- Version 1
++ */
++typedef struct
++{
++    unsigned int version;               //!< The version number of this struct
++    unsigned long long  bar1Size;       //!< BAR1 size in megabytes
++} nvmlVgpuTypeBar1Info_v1_t;
++typedef nvmlVgpuTypeBar1Info_v1_t nvmlVgpuTypeBar1Info_t;
++#define nvmlVgpuTypeBar1Info_v1 NVML_STRUCT_VERSION(VgpuTypeBar1Info, 1)
++
++/**
++ * Structure to store Utilization Value and vgpuInstance
++ */
++typedef struct nvmlVgpuInstanceUtilizationSample_st
++{
++    nvmlVgpuInstance_t  vgpuInstance;       //!< vGPU Instance
++    unsigned long long  timeStamp;          //!< CPU Timestamp in microseconds
++    nvmlValue_t         smUtil;             //!< SM (3D/Compute) Util Value
++    nvmlValue_t         memUtil;            //!< Frame Buffer Memory Util Value
++    nvmlValue_t         encUtil;            //!< Encoder Util Value
++    nvmlValue_t         decUtil;            //!< Decoder Util Value
++} nvmlVgpuInstanceUtilizationSample_t;
++
++/**
++ * Structure to store Utilization Value and vgpuInstance Info -- Version 1
++ */
++typedef struct
++{
++    unsigned long long  timeStamp;          //!< CPU Timestamp in microseconds
++    nvmlVgpuInstance_t  vgpuInstance;       //!< vGPU Instance
++    nvmlValue_t         smUtil;             //!< SM (3D/Compute) Util Value
++    nvmlValue_t         memUtil;            //!< Frame Buffer Memory Util Value
++    nvmlValue_t         encUtil;            //!< Encoder Util Value
++    nvmlValue_t         decUtil;            //!< Decoder Util Value
++    nvmlValue_t         jpgUtil;            //!< Jpeg Util Value
++    nvmlValue_t         ofaUtil;            //!< Ofa Util Value
++} nvmlVgpuInstanceUtilizationInfo_v1_t;
++
++/**
++ * Structure to store recent utilization for vGPU instances running on a device -- version 1
++ */
++typedef struct
++{
++    unsigned int version;                                   //!< The version number of this struct
++    nvmlValueType_t sampleValType;                          //!< Hold the type of returned sample values
++    unsigned int vgpuInstanceCount;                         //!< Hold the number of vGPU instances
++    unsigned long long lastSeenTimeStamp;                   //!< Return only samples with timestamp greater than lastSeenTimeStamp
++    nvmlVgpuInstanceUtilizationInfo_v1_t *vgpuUtilArray;    //!< The array (allocated by caller) in which vGPU utilization are returned
++} nvmlVgpuInstancesUtilizationInfo_v1_t;
++typedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t;
++#define nvmlVgpuInstancesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuInstancesUtilizationInfo, 1)
++
++/**
++ * Structure to store Utilization Value, vgpuInstance and subprocess information
++ */
++typedef struct nvmlVgpuProcessUtilizationSample_st
++{
++    nvmlVgpuInstance_t  vgpuInstance;                               //!< vGPU Instance
++    unsigned int        pid;                                        //!< PID of process running within the vGPU VM
++    char                processName[NVML_VGPU_NAME_BUFFER_SIZE];    //!< Name of process running within the vGPU VM
++    unsigned long long  timeStamp;                                  //!< CPU Timestamp in microseconds
++    unsigned int        smUtil;                                     //!< SM (3D/Compute) Util Value
++    unsigned int        memUtil;                                    //!< Frame Buffer Memory Util Value
++    unsigned int        encUtil;                                    //!< Encoder Util Value
++    unsigned int        decUtil;                                    //!< Decoder Util Value
++} nvmlVgpuProcessUtilizationSample_t;
++
++/**
++ * Structure to store Utilization Value, vgpuInstance and subprocess information for process running on vGPU instance -- version 1
++ */
++typedef struct
++{
++    char                processName[NVML_VGPU_NAME_BUFFER_SIZE];    //!< Name of process running within the vGPU VM
++    unsigned long long  timeStamp;                                  //!< CPU Timestamp in microseconds
++    nvmlVgpuInstance_t  vgpuInstance;                               //!< vGPU Instance
++    unsigned int        pid;                                        //!< PID of process running within the vGPU VM
++    unsigned int        smUtil;                                     //!< SM (3D/Compute) Util Value
++    unsigned int        memUtil;                                    //!< Frame Buffer Memory Util Value
++    unsigned int        encUtil;                                    //!< Encoder Util Value
++    unsigned int        decUtil;                                    //!< Decoder Util Value
++    unsigned int        jpgUtil;                                    //!< Jpeg Util Value
++    unsigned int        ofaUtil;                                    //!< Ofa Util Value
++} nvmlVgpuProcessUtilizationInfo_v1_t;
++
++/**
++ * Structure to store recent utilization, vgpuInstance and subprocess information for processes running on vGPU instances active on a device -- version 1
++ */
++typedef struct
++{
++    unsigned int version;                                   //!< The version number of this struct
++    unsigned int vgpuProcessCount;                          //!< Hold the number of processes running on vGPU instances
++    unsigned long long lastSeenTimeStamp;                   //!< Return only samples with timestamp greater than lastSeenTimeStamp
++    nvmlVgpuProcessUtilizationInfo_v1_t *vgpuProcUtilArray; //!< The array (allocated by caller) in which utilization of processes running on vGPU instances are returned
++} nvmlVgpuProcessesUtilizationInfo_v1_t;
++typedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t;
++#define nvmlVgpuProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuProcessesUtilizationInfo, 1)
++
++/**
++ * vGPU scheduler policies
++ */
++#define NVML_VGPU_SCHEDULER_POLICY_UNKNOWN      0
++#define NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT  1
++#define NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE  2
++#define NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE  3
++
++#define NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT 3 //!< Length of the supportedSchedulers array in nvmlVgpuSchedulerCapabilities_t
++
++#define NVML_SCHEDULER_SW_MAX_LOG_ENTRIES 200 //!< Capacity of the logEntries array in nvmlVgpuSchedulerLog_t
++
++#define NVML_VGPU_SCHEDULER_ARR_DEFAULT   0 //!< Adaptive Round Robin mode value (see arrMode in nvmlVgpuSchedulerLog_t)
++#define NVML_VGPU_SCHEDULER_ARR_DISABLE   1 //!< Adaptive Round Robin mode value (see arrMode in nvmlVgpuSchedulerLog_t)
++#define NVML_VGPU_SCHEDULER_ARR_ENABLE    2 //!< Adaptive Round Robin mode value (see arrMode in nvmlVgpuSchedulerLog_t)
++
++/**
++ * Union to represent the vGPU Scheduler Parameters
++ */
++typedef union
++{
++    struct
++    {
++        unsigned int    avgFactor;          //!< Average factor in compensating the timeslice for Adaptive Round Robin mode
++        unsigned int    timeslice;          //!< The timeslice in ns for each software run list as configured, or the default value otherwise
++    } vgpuSchedDataWithARR;
++
++    struct
++    {
++        unsigned int    timeslice;          //!< The timeslice in ns for each software run list as configured, or the default value otherwise
++    } vgpuSchedData;
++
++} nvmlVgpuSchedulerParams_t;
++
++/**
++ * Structure to store the state and logs of a software runlist
++ */
++typedef struct nvmlVgpuSchedulerLogEntries_st
++{
++    unsigned long long          timestamp;                  //!< Timestamp in ns when this software runlist was preempted
++    unsigned long long          timeRunTotal;               //!< Total time in ns this software runlist has run
++    unsigned long long          timeRun;                    //!< Time in ns this software runlist ran before preemption
++    unsigned int                swRunlistId;                //!< Software runlist Id
++    unsigned long long          targetTimeSlice;            //!< The actual timeslice after deduction
++    unsigned long long          cumulativePreemptionTime;   //!< Preemption time in ns for this SW runlist
++} nvmlVgpuSchedulerLogEntry_t;
++
++/**
++ * Structure to store a vGPU software scheduler log
++ */
++typedef struct nvmlVgpuSchedulerLog_st
++{
++    unsigned int                engineId;                                       //!< Engine whose software runlist log entries are fetched
++    unsigned int                schedulerPolicy;                                //!< Scheduler policy
++    unsigned int                arrMode;                                        //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*.
++    nvmlVgpuSchedulerParams_t   schedulerParams;                                //!< Scheduler parameters (union; the applicable member presumably depends on arrMode -- TODO confirm)
++    unsigned int                entriesCount;                                   //!< Count of log entries fetched
++    nvmlVgpuSchedulerLogEntry_t logEntries[NVML_SCHEDULER_SW_MAX_LOG_ENTRIES];  //!< Log entry storage; entriesCount holds the count of entries fetched
++} nvmlVgpuSchedulerLog_t;
++
++/**
++ * Structure to store the vGPU scheduler state
++ */
++typedef struct nvmlVgpuSchedulerGetState_st
++{
++    unsigned int                schedulerPolicy;    //!< Scheduler policy
++    unsigned int                arrMode;            //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*.
++    nvmlVgpuSchedulerParams_t   schedulerParams;    //!< Scheduler parameters (union; the applicable member presumably depends on arrMode -- TODO confirm)
++} nvmlVgpuSchedulerGetState_t;
++
++/**
++ * Union to represent the vGPU Scheduler set Parameters
++ */
++typedef union
++{
++    struct
++    {
++        unsigned int    avgFactor;          //!< Average factor in compensating the timeslice for Adaptive Round Robin mode
++        unsigned int    frequency;          //!< Frequency for Adaptive Round Robin mode
++    } vgpuSchedDataWithARR;
++
++    struct
++    {
++        unsigned int    timeslice;          //!< The timeslice in ns(Nanoseconds) for each software run list as configured, or the default value otherwise
++    } vgpuSchedData;
++
++} nvmlVgpuSchedulerSetParams_t;
++
++/**
++ * Structure to set the vGPU scheduler state
++ */
++typedef struct nvmlVgpuSchedulerSetState_st
++{
++    unsigned int                    schedulerPolicy;    //!< Scheduler policy
++    unsigned int                    enableARRMode;      //!< Adaptive Round Robin scheduler
++    nvmlVgpuSchedulerSetParams_t    schedulerParams;    //!< Scheduler parameters to set (union; the applicable member presumably depends on enableARRMode -- TODO confirm)
++} nvmlVgpuSchedulerSetState_t;
++
++/**
++ * Structure to store the vGPU scheduler capabilities
++ */
++typedef struct nvmlVgpuSchedulerCapabilities_st
++{
++    unsigned int        supportedSchedulers[NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT]; //!< List the supported vGPU schedulers on the device
++    unsigned int        maxTimeslice;                                                    //!< Maximum timeslice value in ns
++    unsigned int        minTimeslice;                                                    //!< Minimum timeslice value in ns
++    unsigned int        isArrModeSupported;                                              //!< Flag to check Adaptive Round Robin mode enabled/disabled.
++    unsigned int        maxFrequencyForARR;                                              //!< Maximum frequency for Adaptive Round Robin mode
++    unsigned int        minFrequencyForARR;                                              //!< Minimum frequency for Adaptive Round Robin mode
++    unsigned int        maxAvgFactorForARR;                                              //!< Maximum averaging factor for Adaptive Round Robin mode
++    unsigned int        minAvgFactorForARR;                                              //!< Minimum averaging factor for Adaptive Round Robin mode
++} nvmlVgpuSchedulerCapabilities_t;
++
++/**
++ * Structure to store the vGPU license expiry details
++ */
++typedef struct nvmlVgpuLicenseExpiry_st
++{
++    unsigned int    year;        //!< Year of license expiry
++    unsigned short  month;       //!< Month of license expiry
++    unsigned short  day;         //!< Day of license expiry
++    unsigned short  hour;        //!< Hour of license expiry
++    unsigned short  min;         //!< Minutes of license expiry
++    unsigned short  sec;         //!< Seconds of license expiry
++    unsigned char   status;      //!< License expiry status
++} nvmlVgpuLicenseExpiry_t;
++
++/**
++ * vGPU license state
++ */
++#define NVML_GRID_LICENSE_STATE_UNKNOWN                 0   //!< Unknown state
++#define NVML_GRID_LICENSE_STATE_UNINITIALIZED           1   //!< Uninitialized state
++#define NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED 2   //!< Unlicensed unrestricted state
++#define NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED   3   //!< Unlicensed restricted state
++#define NVML_GRID_LICENSE_STATE_UNLICENSED              4   //!< Unlicensed state
++#define NVML_GRID_LICENSE_STATE_LICENSED                5   //!< Licensed state
++
++typedef struct nvmlVgpuLicenseInfo_st // vGPU license status, expiry details and current license state
++{
++    unsigned char               isLicensed;     //!< License status
++    nvmlVgpuLicenseExpiry_t     licenseExpiry;  //!< License expiry information
++    unsigned int                currentState;   //!< Current license state (presumably one of NVML_GRID_LICENSE_STATE_* -- TODO confirm)
++} nvmlVgpuLicenseInfo_t;
++
++/**
++ * Structure to store license expiry date and time values
++ */
++typedef struct nvmlGridLicenseExpiry_st
++{
++    unsigned int   year;        //!< Year value of license expiry
++    unsigned short month;       //!< Month value of license expiry
++    unsigned short day;         //!< Day value of license expiry
++    unsigned short hour;        //!< Hour value of license expiry
++    unsigned short min;         //!< Minutes value of license expiry
++    unsigned short sec;         //!< Seconds value of license expiry
++    unsigned char  status;      //!< License expiry status
++} nvmlGridLicenseExpiry_t;
++
++/**
++ * Structure containing vGPU software licensable feature information
++ */
++typedef struct nvmlGridLicensableFeature_st
++{
++    nvmlGridLicenseFeatureCode_t    featureCode;                                 //!< Licensed feature code
++    unsigned int                    featureState;                                //!< Non-zero if feature is currently licensed, otherwise zero
++    char                            licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE];  //!< Deprecated.
++    char                            productName[NVML_GRID_LICENSE_BUFFER_SIZE];  //!< Product name of feature
++    unsigned int                    featureEnabled;                              //!< Non-zero if feature is enabled, otherwise zero
++    nvmlGridLicenseExpiry_t         licenseExpiry;                               //!< License expiry structure containing date and time
++} nvmlGridLicensableFeature_t;
++
++/**
++ * Structure to store vGPU software licensable features
++ */
++typedef struct nvmlGridLicensableFeatures_st
++{
++    int                         isGridLicenseSupported;                                       //!< Non-zero if vGPU Software Licensing is supported on the system, otherwise zero
++    unsigned int                licensableFeaturesCount;                                      //!< Entries returned in \a gridLicensableFeatures array
++    nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT];  //!< Array of vGPU software licensable features.
++} nvmlGridLicensableFeatures_t;
++
++/** @} */
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlFieldValueEnums Field Value Enums
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Field Identifiers.
++ *
++ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
++ */
++#define NVML_FI_DEV_ECC_CURRENT           1   //!< Current ECC mode. 1=Active. 0=Inactive
++#define NVML_FI_DEV_ECC_PENDING           2   //!< Pending ECC mode. 1=Active. 0=Inactive
++/* ECC Count Totals */
++#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL     3   //!< Total single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL     4   //!< Total double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL     5   //!< Total single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL     6   //!< Total double bit aggregate (persistent) ECC errors
++/* Individual ECC locations */
++#define NVML_FI_DEV_ECC_SBE_VOL_L1        7   //!< L1 cache single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_L1        8   //!< L1 cache double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_VOL_L2        9   //!< L2 cache single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_L2        10  //!< L2 cache double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_VOL_DEV       11  //!< Device memory single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_DEV       12  //!< Device memory double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_VOL_REG       13  //!< Register file single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_REG       14  //!< Register file double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_VOL_TEX       15  //!< Texture memory single bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_TEX       16  //!< Texture memory double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_DBE_VOL_CBU       17  //!< CBU double bit volatile ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_L1        18  //!< L1 cache single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_L1        19  //!< L1 cache double bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_L2        20  //!< L2 cache single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_L2        21  //!< L2 cache double bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_DEV       22  //!< Device memory single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_DEV       23  //!< Device memory double bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_REG       24  //!< Register File single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_REG       25  //!< Register File double bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_SBE_AGG_TEX       26  //!< Texture memory single bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_TEX       27  //!< Texture memory double bit aggregate (persistent) ECC errors
++#define NVML_FI_DEV_ECC_DBE_AGG_CBU       28  //!< CBU double bit aggregate ECC errors
++
++/* Page Retirement */
++#define NVML_FI_DEV_RETIRED_SBE           29  //!< Number of retired pages because of single bit errors
++#define NVML_FI_DEV_RETIRED_DBE           30  //!< Number of retired pages because of double bit errors
++#define NVML_FI_DEV_RETIRED_PENDING       31  //!< If any pages are pending retirement. 1=yes. 0=no.
++
++/* NvLink Flit Error Counters */
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0    32 //!< NVLink flow control CRC Error Counter for Lane 0
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1    33 //!< NVLink flow control CRC Error Counter for Lane 1
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2    34 //!< NVLink flow control CRC Error Counter for Lane 2
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3    35 //!< NVLink flow control CRC Error Counter for Lane 3
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4    36 //!< NVLink flow control CRC Error Counter for Lane 4
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5    37 //!< NVLink flow control CRC Error Counter for Lane 5
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes
++
++/* NvLink CRC Data Error Counters */
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0    39 //!< NVLink data CRC Error Counter for Lane 0
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1    40 //!< NVLink data CRC Error Counter for Lane 1
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2    41 //!< NVLink data CRC Error Counter for Lane 2
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3    42 //!< NVLink data CRC Error Counter for Lane 3
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4    43 //!< NVLink data CRC Error Counter for Lane 4
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5    44 //!< NVLink data CRC Error Counter for Lane 5
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NVLink data CRC Error Counter total for all Lanes
++
++/* NvLink Replay Error Counters */
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0      46 //!< NVLink Replay Error Counter for Lane 0
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1      47 //!< NVLink Replay Error Counter for Lane 1
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2      48 //!< NVLink Replay Error Counter for Lane 2
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3      49 //!< NVLink Replay Error Counter for Lane 3
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4      50 //!< NVLink Replay Error Counter for Lane 4
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5      51 //!< NVLink Replay Error Counter for Lane 5
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL   52 //!< NVLink Replay Error Counter total for all Lanes
++
++/* NvLink Recovery Error Counters */
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0    53 //!< NVLink Recovery Error Counter for Lane 0
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1    54 //!< NVLink Recovery Error Counter for Lane 1
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2    55 //!< NVLink Recovery Error Counter for Lane 2
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3    56 //!< NVLink Recovery Error Counter for Lane 3
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4    57 //!< NVLink Recovery Error Counter for Lane 4
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5    58 //!< NVLink Recovery Error Counter for Lane 5
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes
++
++/* NvLink Bandwidth Counters */
++/*
++ * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated.
++ * Please use the following field values instead:
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX
++ */
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0     60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1     61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2     62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3     63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4     64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5     65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL  66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes
++
++/* NvLink Bandwidth Counters, Counter Set 1 (NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are deprecated) */
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0     67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1     68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2     69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3     70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4     71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5     72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL  73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes
++
++/* NVML Perf Policy Counters */
++#define NVML_FI_DEV_PERF_POLICY_POWER              74   //!< Perf Policy Counter for Power Policy
++#define NVML_FI_DEV_PERF_POLICY_THERMAL            75   //!< Perf Policy Counter for Thermal Policy
++#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST         76   //!< Perf Policy Counter for Sync boost Policy
++#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT        77   //!< Perf Policy Counter for Board Limit
++#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION    78   //!< Perf Policy Counter for Low GPU Utilization Policy
++#define NVML_FI_DEV_PERF_POLICY_RELIABILITY        79   //!< Perf Policy Counter for Reliability Policy
++#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS   80   //!< Perf Policy Counter for Total App Clock Policy
++#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS  81   //!< Perf Policy Counter for Total Base Clocks Policy
++
++/* Memory temperatures */
++#define NVML_FI_DEV_MEMORY_TEMP  82 //!< Memory temperature for the device
++
++/* Energy Counter */
++#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded
++
++/* NVLink Speed */
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0     84  //!< NVLink Speed in MBps for Link 0
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1     85  //!< NVLink Speed in MBps for Link 1
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2     86  //!< NVLink Speed in MBps for Link 2
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3     87  //!< NVLink Speed in MBps for Link 3
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4     88  //!< NVLink Speed in MBps for Link 4
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5     89  //!< NVLink Speed in MBps for Link 5
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90  //!< Common NVLink Speed in MBps for active links
++
++#define NVML_FI_DEV_NVLINK_LINK_COUNT        91  //!< Number of NVLinks present on the device
++
++#define NVML_FI_DEV_RETIRED_PENDING_SBE      92  //!< If any pages are pending retirement due to SBE. 1=yes. 0=no.
++#define NVML_FI_DEV_RETIRED_PENDING_DBE      93  //!< If any pages are pending retirement due to DBE. 1=yes. 0=no.
++
++#define NVML_FI_DEV_PCIE_REPLAY_COUNTER             94  //!< PCIe replay counter
++#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER    95  //!< PCIe replay rollover counter
++
++/* NvLink Flit Error Counters */
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6     96 //!< NVLink flow control CRC Error Counter for Lane 6
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7     97 //!< NVLink flow control CRC Error Counter for Lane 7
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8     98 //!< NVLink flow control CRC Error Counter for Lane 8
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9     99 //!< NVLink flow control CRC Error Counter for Lane 9
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10   100 //!< NVLink flow control CRC Error Counter for Lane 10
++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11   101 //!< NVLink flow control CRC Error Counter for Lane 11
++
++/* NvLink CRC Data Error Counters */
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6    102 //!< NVLink data CRC Error Counter for Lane 6
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7    103 //!< NVLink data CRC Error Counter for Lane 7
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8    104 //!< NVLink data CRC Error Counter for Lane 8
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9    105 //!< NVLink data CRC Error Counter for Lane 9
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10   106 //!< NVLink data CRC Error Counter for Lane 10
++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11   107 //!< NVLink data CRC Error Counter for Lane 11
++
++/* NvLink Replay Error Counters */
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6      108 //!< NVLink Replay Error Counter for Lane 6
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7      109 //!< NVLink Replay Error Counter for Lane 7
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8      110 //!< NVLink Replay Error Counter for Lane 8
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9      111 //!< NVLink Replay Error Counter for Lane 9
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10     112 //!< NVLink Replay Error Counter for Lane 10
++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11     113 //!< NVLink Replay Error Counter for Lane 11
++
++/* NvLink Recovery Error Counters */
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6    114 //!< NVLink Recovery Error Counter for Lane 6
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7    115 //!< NVLink Recovery Error Counter for Lane 7
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8    116 //!< NVLink Recovery Error Counter for Lane 8
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9    117 //!< NVLink Recovery Error Counter for Lane 9
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10   118 //!< NVLink Recovery Error Counter for Lane 10
++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11   119 //!< NVLink Recovery Error Counter for Lane 11
++
++/* NvLink Bandwidth Counters */
++/*
++ * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated.
++ * Please use the following field values instead:
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX
++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX
++ */
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6     120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7     121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8     122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9     123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10    124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11    125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11
++
++/* NvLink Bandwidth Counters, Counter Set 1 (NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are deprecated) */
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6     126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7     127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8     128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9     129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10    130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10
++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11    131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11
++
++/* NVLink Speed */
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6     132  //!< NVLink Speed in MBps for Link 6
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7     133  //!< NVLink Speed in MBps for Link 7
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8     134  //!< NVLink Speed in MBps for Link 8
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9     135  //!< NVLink Speed in MBps for Link 9
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10    136  //!< NVLink Speed in MBps for Link 10
++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11    137  //!< NVLink Speed in MBps for Link 11
++
++/**
++ * NVLink throughput counters field values
++ *
++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
++ * A scopeId of UINT_MAX returns aggregate value summed up across all links
++ * for the specified counter type in fieldId.
++ */
++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX      138 //!< NVLink TX Data throughput in KiB
++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX      139 //!< NVLink RX Data throughput in KiB
++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX       140 //!< NVLink TX Data + protocol overhead in KiB
++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX       141 //!< NVLink RX Data + protocol overhead in KiB
++
++/* Row Remapper */
++#define NVML_FI_DEV_REMAPPED_COR        142 //!< Number of remapped rows due to correctable errors
++#define NVML_FI_DEV_REMAPPED_UNC        143 //!< Number of remapped rows due to uncorrectable errors
++#define NVML_FI_DEV_REMAPPED_PENDING    144 //!< If any rows are pending remapping. 1=yes 0=no
++#define NVML_FI_DEV_REMAPPED_FAILURE    145 //!< If any rows failed to be remapped 1=yes 0=no
++
++/**
++ * Remote device NVLink ID
++ *
++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
++ */
++#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID     146 //!< Remote device NVLink ID
++
++/**
++ * NVSwitch: connected NVLink count
++ */
++#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT   147  //!< Number of NVLinks connected to NVSwitch
++
++/* NvLink ECC Data Error Counters
++ *
++ * Lane ID needs to be specified in the scopeId field in nvmlFieldValue_t.
++ *
++ */
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0    148 //!< NVLink data ECC Error Counter for Link 0
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1    149 //!< NVLink data ECC Error Counter for Link 1
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2    150 //!< NVLink data ECC Error Counter for Link 2
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3    151 //!< NVLink data ECC Error Counter for Link 3
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4    152 //!< NVLink data ECC Error Counter for Link 4
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5    153 //!< NVLink data ECC Error Counter for Link 5
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6    154 //!< NVLink data ECC Error Counter for Link 6
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7    155 //!< NVLink data ECC Error Counter for Link 7
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8    156 //!< NVLink data ECC Error Counter for Link 8
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9    157 //!< NVLink data ECC Error Counter for Link 9
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10   158 //!< NVLink data ECC Error Counter for Link 10
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11   159 //!< NVLink data ECC Error Counter for Link 11
++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NVLink data ECC Error Counter total for all Links
++
++#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY            161 //!< NVLink Replay Error Counter
++#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY          162 //!< NVLink Recovery Error Counter
++#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC               163 //!< NVLink CRC Error Counter
++#define NVML_FI_DEV_NVLINK_GET_SPEED                  164 //!< NVLink Speed in MBps
++#define NVML_FI_DEV_NVLINK_GET_STATE                  165 //!< NVLink State - Active,Inactive
++#define NVML_FI_DEV_NVLINK_GET_VERSION                166 //!< NVLink Version
++
++#define NVML_FI_DEV_NVLINK_GET_POWER_STATE            167 //!< NVLink Power state. 0=HIGH_SPEED 1=LOW_SPEED (NVML_NVLINK_POWER_STATE_HIGH_SPEED / NVML_NVLINK_POWER_STATE_LOW)
++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD        168 //!< NVLink length of idle period (units can be found from
++                                                          //   NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS) before
++                                                          //   transitioning links to sleep state
++
++#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER       169 //!< Device PEX error recovery counter
++
++#define NVML_FI_DEV_C2C_LINK_COUNT                    170 //!< Number of C2C Links present on the device
++#define NVML_FI_DEV_C2C_LINK_GET_STATUS               171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE
++#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW               172 //!< C2C Link Speed in MBps for active links
++
++#define NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS     173 //!< PCIe Correctable Errors Counter
++#define NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED          174 //!< PCIe NAK Receive Counter
++#define NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR         175 //!< PCIe Receiver Error Counter
++#define NVML_FI_DEV_PCIE_COUNT_BAD_TLP                176 //!< PCIe Bad TLP Counter
++#define NVML_FI_DEV_PCIE_COUNT_NAKS_SENT              177 //!< PCIe NAK Send Counter
++#define NVML_FI_DEV_PCIE_COUNT_BAD_DLLP               178 //!< PCIe Bad DLLP Counter
++#define NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR        179 //!< PCIe Non Fatal Error Counter
++#define NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR            180 //!< PCIe Fatal Error Counter
++#define NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ        181 //!< PCIe Unsupported Request Counter
++#define NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR             182 //!< PCIe LCRC Error Counter
++#define NVML_FI_DEV_PCIE_COUNT_LANE_ERROR             183 //!< PCIe Per Lane Error Counter.
++
++#define NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED        184 //!< Device's Resetless MIG Capability
++
++/**
++ * Retrieves power usage for this GPU in milliwatts.
++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and
++ * \ref nvmlDeviceGetPowerUsage.
++ *
++ * scopeId needs to be specified. It signifies:
++ * 0 - GPU Only Scope - Metrics for GPU are retrieved
++ * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved.
++ * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported
++ */
++#define NVML_FI_DEV_POWER_AVERAGE                     185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures.
++#define NVML_FI_DEV_POWER_INSTANT                     186 //!< Current GPU power, supported on all architectures.
++#define NVML_FI_DEV_POWER_MIN_LIMIT                   187 //!< Minimum power limit in milliwatts.
++#define NVML_FI_DEV_POWER_MAX_LIMIT                   188 //!< Maximum power limit in milliwatts.
++#define NVML_FI_DEV_POWER_DEFAULT_LIMIT               189 //!< Default power limit in milliwatts (limit which device boots with).
++#define NVML_FI_DEV_POWER_CURRENT_LIMIT               190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band).
++#define NVML_FI_DEV_ENERGY                            191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU.
++#define NVML_FI_DEV_POWER_REQUESTED_LIMIT             192 //!< Power limit requested by NVML or any other userspace client.
++
++/**
++ * GPU T.Limit temperature thresholds in degree Celsius
++ *
++ * These fields are supported on Ada and later architectures and supersede \ref nvmlDeviceGetTemperatureThreshold.
++ */
++#define NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT       193 //!< T.Limit temperature after which GPU may shut down for HW protection
++#define NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT       194 //!< T.Limit temperature after which GPU may begin HW slowdown
++#define NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT        195 //!< T.Limit temperature after which GPU may begin SW slowdown due to memory temperature
++#define NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT        196 //!< T.Limit temperature after which GPU may be throttled below base clock
++
++#define NVML_FI_DEV_PCIE_COUNT_TX_BYTES               197 //!< PCIe transmit bytes. Value can be wrapped.
++#define NVML_FI_DEV_PCIE_COUNT_RX_BYTES               198 //!< PCIe receive bytes. Value can be wrapped.
++
++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX    199 //!< Max Nvlink Power Threshold. See NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD
++
++#define NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE   200 //!< MIG mode independent, MIG query capable device. 1=yes. 0=no.
++
++#define NVML_FI_DEV_NVLINK_COUNT_XMIT_PACKETS                    201 //!<Total Tx packets on the link in NVLink5
++#define NVML_FI_DEV_NVLINK_COUNT_XMIT_BYTES                      202 //!<Total Tx bytes on the link in NVLink5
++#define NVML_FI_DEV_NVLINK_COUNT_RCV_PACKETS                     203 //!<Total Rx packets on the link in NVLink5
++#define NVML_FI_DEV_NVLINK_COUNT_RCV_BYTES                       204 //!<Total Rx bytes on the link in NVLink5
++#define NVML_FI_DEV_NVLINK_COUNT_VL15_DROPPED                    205 //!<Number of VL15 MADs dropped on a link in NVLink5
++#define NVML_FI_DEV_NVLINK_COUNT_MALFORMED_PACKET_ERRORS         206 //!<Number of packets Rx on a link where packets are malformed
++#define NVML_FI_DEV_NVLINK_COUNT_BUFFER_OVERRUN_ERRORS           207 //!<Number of packets that were discarded on Rx due to buffer overrun
++#define NVML_FI_DEV_NVLINK_COUNT_RCV_ERRORS                      208 //!<Total number of packets with errors Rx on a link
++#define NVML_FI_DEV_NVLINK_COUNT_RCV_REMOTE_ERRORS               209 //!<Total number of packets Rx - stomp/EBP marker
++#define NVML_FI_DEV_NVLINK_COUNT_RCV_GENERAL_ERRORS              210 //!<Total number of packets Rx with header mismatch
++#define NVML_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS     211 //!<Total number of times that the count of local errors exceeded a threshold
++#define NVML_FI_DEV_NVLINK_COUNT_XMIT_DISCARDS                   212 //!<Total number of tx error packets that were discarded
++
++#define NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS 213 //!<Number of times link went from Up to recovery, succeeded and link came back up
++#define NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS     214 //!<Number of times link went from Up to recovery, failed and link was declared down
++#define NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS            215 //!<Number of times link went from Up to recovery, irrespective of the result
++
++#define NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE0                   216 //!<BER per lane for lane 0
++#define NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE1                   217 //!<BER per lane for lane 1
++#define NVML_FI_DEV_NVLINK_COUNT_RAW_BER                         218 //!<BER per link. Sum of all the raw errors per lane/Bits received per link
++#define NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS                219 //!<Sum of the number of errors in each Nvlink packet
++#define NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER                   220 //!<Effective BER for effective errors
++#define NVML_FI_DEV_NVLINK_COUNT_SYMBOL_ERRORS                   221 //!<Number of errors in rx symbols
++#define NVML_FI_DEV_NVLINK_COUNT_SYMBOL_BER                      222 //!<BER for symbol errors
++
++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MIN               223 //!< Min Nvlink Power Threshold. See NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD
++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS             224 //!< Values are in the form NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_*
++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_SUPPORTED         225 //!< Determine if Nvlink Power Threshold feature is supported
++
++#define NVML_FI_DEV_RESET_STATUS                                 226 //!< GPU reset status
++#define NVML_FI_DEV_DRAIN_AND_RESET_STATUS                       227 //!< GPU drain and reset status
++#define NVML_FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK                   228 //!< PCIe outbound atomics mask (bit meanings not documented here - TODO confirm against NVML docs)
++#define NVML_FI_DEV_PCIE_INBOUND_ATOMICS_MASK                    229 //!< PCIe inbound atomics mask (bit meanings not documented here - TODO confirm against NVML docs)
++
++#define NVML_FI_MAX                                              230 //!< One greater than the largest field ID defined above
++
++/**
++ * Unit values reported by the NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS field
++ */
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_100US 0x0 //!< Presumably 100 microsecond units (inferred from the name - confirm)
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_50US  0x1 //!< Presumably 50 microsecond units (inferred from the name - confirm)
++
++#define NVML_NVLINK_POWER_STATE_HIGH_SPEED    0x0 //!< NVML_FI_DEV_NVLINK_GET_POWER_STATE value 0
++#define NVML_NVLINK_POWER_STATE_LOW           0x1 //!< NVML_FI_DEV_NVLINK_GET_POWER_STATE value 1
++
++/*
++ * NVML_NVLINK_LOW_POWER_THRESHOLD_MIN will be deprecated.
++ * Use the NVML Field Value NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MIN
++ * to get the correct Min Low Power Threshold.
++ */
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_MIN   0x1
++
++/*
++ * NVML_NVLINK_LOW_POWER_THRESHOLD_MAX will be deprecated.
++ * Use the NVML Field Value NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX
++ * to get the correct Max Low Power Threshold.
++ */
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_MAX   0x1FFF
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_RESET 0xFFFFFFFF
++
++#define NVML_NVLINK_LOW_POWER_THRESHOLD_DEFAULT NVML_NVLINK_LOW_POWER_THRESHOLD_RESET //!< Default is an alias of the RESET value (0xFFFFFFFF)
++
++/** Structure containing NVLink low power parameters */
++typedef struct nvmlNvLinkPowerThres_st
++{
++    unsigned int lowPwrThreshold;           //!< Low power threshold
++                                            //   Units can be obtained from
++                                            //   NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS
++} nvmlNvLinkPowerThres_t;
++
++/**
++ * Information for a Field Value Sample
++ */
++typedef struct nvmlFieldValue_st
++{
++    unsigned int fieldId;       //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
++    unsigned int scopeId;       //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId.
++    long long timestamp;        //!< CPU Timestamp of this value in microseconds since 1970
++    long long latencyUsec;      //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call.
++    nvmlValueType_t valueType;  //!< Type of the value stored in value
++    nvmlReturn_t nvmlReturn;    //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS
++    nvmlValue_t value;          //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
++} nvmlFieldValue_t;
++
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlUnitStructs Unit Structs
++ *  @{
++ */
++/***************************************************************************************************/
++
++typedef struct nvmlUnit_st* nvmlUnit_t; //!< Handle to a unit (pointer to struct nvmlUnit_st)
++
++/**
++ * Description of HWBC entry
++ */
++typedef struct nvmlHwbcEntry_st
++{
++    unsigned int hwbcId;            //!< ID of this HWBC entry
++    char firmwareVersion[32];       //!< Firmware version, held in a 32-byte char buffer
++} nvmlHwbcEntry_t;
++
++/**
++ * Fan state enum.
++ */
++typedef enum nvmlFanState_enum
++{
++    NVML_FAN_NORMAL       = 0,     //!< Fan is working properly
++    NVML_FAN_FAILED       = 1      //!< Fan has failed
++} nvmlFanState_t;
++
++/**
++ * Led color enum.
++ */
++typedef enum nvmlLedColor_enum
++{
++    NVML_LED_COLOR_GREEN       = 0,     //!< GREEN, indicates good health
++    NVML_LED_COLOR_AMBER       = 1      //!< AMBER, indicates problem
++} nvmlLedColor_t;
++
++
++/**
++ * LED states for an S-class unit.
++ */
++typedef struct nvmlLedState_st
++{
++    char cause[256];               //!< If amber, a text description of the cause
++    nvmlLedColor_t color;          //!< GREEN or AMBER
++} nvmlLedState_t;
++
++/**
++ * Static S-class unit info.
++ */
++typedef struct nvmlUnitInfo_st
++{
++    char name[96];                      //!< Product name
++    char id[96];                        //!< Product identifier
++    char serial[96];                    //!< Product serial number
++    char firmwareVersion[96];           //!< Firmware version
++} nvmlUnitInfo_t;
++
++/**
++ * Power usage information for an S-class unit.
++ * The power supply state is a human readable string that equals "Normal" or contains
++ * a combination of "Abnormal" plus one or more of the following:
++ *
++ *    - High voltage
++ *    - Fan failure
++ *    - Heatsink temperature
++ *    - Current limit
++ *    - Voltage below UV alarm threshold
++ *    - Low-voltage
++ *    - SI2C remote off command
++ *    - MOD_DISABLE input
++ *    - Short pin transition
++ */
++typedef struct nvmlPSUInfo_st
++{
++    char state[256];                 //!< The power supply state
++    unsigned int current;            //!< PSU current (A)
++    unsigned int voltage;            //!< PSU voltage (V)
++    unsigned int power;              //!< PSU power draw (W)
++} nvmlPSUInfo_t;
++
++/**
++ * Fan speed reading for a single fan in an S-class unit.
++ */
++typedef struct nvmlUnitFanInfo_st
++{
++    unsigned int speed;              //!< Fan speed (RPM)
++    nvmlFanState_t state;            //!< Flag that indicates whether fan is working properly
++} nvmlUnitFanInfo_t;
++
++/**
++ * Fan speed readings for an entire S-class unit.
++ */
++typedef struct nvmlUnitFanSpeeds_st
++{
++    nvmlUnitFanInfo_t fans[24];      //!< Fan speed data for each fan (array has room for 24 entries)
++    unsigned int count;              //!< Number of fans in unit (presumably the number of valid entries in fans - confirm)
++} nvmlUnitFanSpeeds_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @addtogroup nvmlEvents
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Handle to an event set
++ */
++typedef struct nvmlEventSet_st* nvmlEventSet_t;
++
++/** @defgroup nvmlEventType Event Types
++ * @{
++ * Event types which the user can be notified about.
++ * See description of particular functions for details.
++ *
++ * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices
++ * support each event.
++ *
++ * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents
++ */
++//! Mask with no events
++#define nvmlEventTypeNone                       0x0000000000000000LL
++
++//! Event about single bit ECC errors
++/**
++ * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event
++ */
++#define nvmlEventTypeSingleBitEccError          0x0000000000000001LL
++
++//! Event about double bit ECC errors
++/**
++ * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event
++ */
++#define nvmlEventTypeDoubleBitEccError          0x0000000000000002LL
++
++//! Event about PState changes
++/**
++ *  \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to
++ *  no work being executed on the GPU, power capping or thermal capping. In a typical situation,
++ *  Fermi-based GPU should stay in P0 for the duration of the execution of the compute process.
++ */
++#define nvmlEventTypePState                     0x0000000000000004LL
++
++//! Event that Xid critical error occurred
++#define nvmlEventTypeXidCriticalError           0x0000000000000008LL
++
++//! Event about clock changes
++/**
++ * Kepler only
++ */
++#define nvmlEventTypeClock                      0x0000000000000010LL
++
++//! Event about AC/Battery power source changes
++#define nvmlEventTypePowerSourceChange          0x0000000000000080LL
++
++//! Event about MIG configuration changes
++#define nvmlEventMigConfigChange                0x0000000000000100LL
++
++//! Event about single bit ECC error storm
++#define nvmlEventTypeSingleBitEccErrorStorm     0x0000000000000200LL
++
++//! Event about DRAM retirement event
++#define nvmlEventTypeDramRetirementEvent        0x0000000000000400LL
++
++//! Event about DRAM retirement failure
++#define nvmlEventTypeDramRetirementFailure      0x0000000000000800LL
++
++//! Event for Non Fatal Poison
++#define nvmlEventTypeNonFatalPoisonError        0x0000000000001000LL
++
++//! Event for Fatal Poison
++#define nvmlEventTypeFatalPoisonError           0x0000000000002000LL
++
++//! Mask of all events
++#define nvmlEventTypeAll (nvmlEventTypeNone    \
++        | nvmlEventTypeSingleBitEccError       \
++        | nvmlEventTypeDoubleBitEccError       \
++        | nvmlEventTypePState                  \
++        | nvmlEventTypeClock                   \
++        | nvmlEventTypeXidCriticalError        \
++        | nvmlEventTypePowerSourceChange       \
++        | nvmlEventMigConfigChange             \
++        | nvmlEventTypeSingleBitEccErrorStorm  \
++        | nvmlEventTypeDramRetirementEvent     \
++        | nvmlEventTypeDramRetirementFailure   \
++        | nvmlEventTypeNonFatalPoisonError     \
++        | nvmlEventTypeFatalPoisonError        \
++        )
++/** @} */
++
++/**
++ * Information about an event that has occurred
++ */
++typedef struct nvmlEventData_st
++{
++    nvmlDevice_t        device;             //!< Specific device where the event occurred
++    unsigned long long  eventType;          //!< Information about what specific event occurred
++    unsigned long long  eventData;          //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError,
++                                            //!< eventData is 0 for any other event. eventData is set as 999 for unknown xid error.
++    unsigned int        gpuInstanceId;      //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU
++                                            //!< instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF
++                                            //!< otherwise.
++    unsigned int        computeInstanceId;  //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a
++                                            //!< compute instance, stores a valid compute instance ID. computeInstanceId is set to
++                                            //!< 0xFFFFFFFF otherwise.
++} nvmlEventData_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @addtogroup nvmlClocksEventReasons
++ *  @{
++ */
++/***************************************************************************************************/
++
++/** Nothing is running on the GPU and the clocks are dropping to Idle state
++ * \note This limiter may be removed in a later release
++ */
++#define nvmlClocksEventReasonGpuIdle                   0x0000000000000001LL
++
++/** GPU clocks are limited by current setting of applications clocks
++ *
++ * @see nvmlDeviceSetApplicationsClocks
++ * @see nvmlDeviceGetApplicationsClock
++ */
++#define nvmlClocksEventReasonApplicationsClocksSetting 0x0000000000000002LL
++
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonApplicationsClocksSetting instead,
++ *             as that name describes the situation more accurately.
++ */
++#define nvmlClocksThrottleReasonUserDefinedClocks         nvmlClocksEventReasonApplicationsClocksSetting
++
++/** The clocks have been optimized to ensure not to exceed currently set power limits
++ *
++ * @see nvmlDeviceGetPowerUsage
++ * @see nvmlDeviceSetPowerManagementLimit
++ * @see nvmlDeviceGetPowerManagementLimit
++ */
++#define nvmlClocksEventReasonSwPowerCap                0x0000000000000004LL
++
++/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
++ *
++ * This is an indicator of:
++ *   - temperature being too high
++ *   - External Power Brake Assertion is triggered (e.g. by the system power supply)
++ *   - Power draw is too high and Fast Trigger protection is reducing the clocks
++ *   - May be also reported during PState or clock change
++ *      - This behavior may be removed in a later release.
++ *
++ * @see nvmlDeviceGetTemperature
++ * @see nvmlDeviceGetTemperatureThreshold
++ * @see nvmlDeviceGetPowerUsage
++ */
++#define nvmlClocksThrottleReasonHwSlowdown                0x0000000000000008LL
++
++/** Sync Boost
++ *
++ * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
++ * order to maximize performance per watt. All GPUs in the sync boost group
++ * will boost to the minimum possible clocks across the entire group. Look at
++ * the throttle reasons for other GPUs in the system to see why those GPUs are
++ * holding this one at lower clocks.
++ *
++ */
++#define nvmlClocksEventReasonSyncBoost                 0x0000000000000010LL
++
++/** SW Thermal Slowdown
++ *
++ * The current clocks have been optimized to ensure the following is true:
++ *  - Current GPU temperature does not exceed GPU Max Operating Temperature
++ *  - Current memory temperature does not exceed Memory Max Operating Temperature
++ *
++ */
++#define nvmlClocksEventReasonSwThermalSlowdown         0x0000000000000020LL
++
++/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
++ *
++ * This is an indicator of:
++ *   - temperature being too high
++ *
++ * @see nvmlDeviceGetTemperature
++ * @see nvmlDeviceGetTemperatureThreshold
++ * @see nvmlDeviceGetPowerUsage
++ */
++#define nvmlClocksThrottleReasonHwThermalSlowdown         0x0000000000000040LL
++
++/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
++ *
++ * This is an indicator of:
++ *   - External Power Brake Assertion being triggered (e.g. by the system power supply)
++ *
++ * @see nvmlDeviceGetTemperature
++ * @see nvmlDeviceGetTemperatureThreshold
++ * @see nvmlDeviceGetPowerUsage
++ */
++#define nvmlClocksThrottleReasonHwPowerBrakeSlowdown      0x0000000000000080LL
++
++/** GPU clocks are limited by current setting of Display clocks
++ *
++ * @see bug 1997531
++ */
++#define nvmlClocksEventReasonDisplayClockSetting       0x0000000000000100LL
++
++/** Bit mask representing no clocks throttling
++ *
++ * Clocks are as high as possible.
++ * */
++#define nvmlClocksEventReasonNone                      0x0000000000000000LL
++
++/** Bit mask representing all supported clocks event (throttling) reasons
++ * New reasons might be added to this list in the future
++ */
++#define nvmlClocksEventReasonAll (nvmlClocksEventReasonNone    \
++      | nvmlClocksEventReasonGpuIdle                           \
++      | nvmlClocksEventReasonApplicationsClocksSetting         \
++      | nvmlClocksEventReasonSwPowerCap                        \
++      | nvmlClocksThrottleReasonHwSlowdown                     \
++      | nvmlClocksEventReasonSyncBoost                         \
++      | nvmlClocksEventReasonSwThermalSlowdown                 \
++      | nvmlClocksThrottleReasonHwThermalSlowdown              \
++      | nvmlClocksThrottleReasonHwPowerBrakeSlowdown           \
++      | nvmlClocksEventReasonDisplayClockSetting               \
++)
++
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonGpuIdle instead
++ */
++#define nvmlClocksThrottleReasonGpuIdle                      nvmlClocksEventReasonGpuIdle
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonApplicationsClocksSetting instead
++ */
++#define nvmlClocksThrottleReasonApplicationsClocksSetting    nvmlClocksEventReasonApplicationsClocksSetting
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonSyncBoost instead
++ */
++#define nvmlClocksThrottleReasonSyncBoost                    nvmlClocksEventReasonSyncBoost
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonSwPowerCap instead
++ */
++#define nvmlClocksThrottleReasonSwPowerCap                   nvmlClocksEventReasonSwPowerCap
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonSwThermalSlowdown instead
++ */
++#define nvmlClocksThrottleReasonSwThermalSlowdown            nvmlClocksEventReasonSwThermalSlowdown
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonDisplayClockSetting instead
++ */
++#define nvmlClocksThrottleReasonDisplayClockSetting          nvmlClocksEventReasonDisplayClockSetting
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonNone instead
++ */
++#define nvmlClocksThrottleReasonNone                         nvmlClocksEventReasonNone
++/**
++ * @deprecated Use \ref nvmlClocksEventReasonAll instead
++ */
++#define nvmlClocksThrottleReasonAll                          nvmlClocksEventReasonAll
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlAccountingStats Accounting Statistics
++ *  @{
++ *
++ *  Set of APIs designed to provide per process information about usage of GPU.
++ *
++ *  @note All accounting statistics and accounting mode live in nvidia driver and reset
++ *        to default (Disabled) when driver unloads.
++ *        It is advised to run with persistence mode enabled.
++ *
++ *  @note Enabling accounting mode has no negative impact on the GPU performance.
++ */
++/***************************************************************************************************/
++
++/**
++ * Describes accounting statistics of a process.
++ */
++typedef struct nvmlAccountingStats_st {
++    unsigned int gpuUtilization;                //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU.
++                                                //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a
++                                                //! process (not just the last sample period).
++                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
++
++    unsigned int memoryUtilization;             //!< Percent of time over the process's lifetime during which global (device) memory was being read or written.
++                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
++
++    unsigned long long maxMemoryUsage;          //!< Maximum total memory in bytes that was ever allocated by the process.
++                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported
++
++
++    unsigned long long time;                    //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if
++                                                //!< the process is not terminated
++
++    unsigned long long startTime;               //!< CPU Timestamp in usec representing start time for the process
++
++    unsigned int isRunning;                     //!< Flag to represent if the process is running (1 for running, 0 for terminated)
++
++    unsigned int reserved[5];                   //!< Reserved for future use
++} nvmlAccountingStats_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlEncoderStructs Encoder Structs
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Represents the type of encoder whose capacity can be queried
++ */
++typedef enum nvmlEncoderQueryType_enum
++{
++    NVML_ENCODER_QUERY_H264     = 0x00,        //!< H264 encoder
++    NVML_ENCODER_QUERY_HEVC     = 0x01,        //!< HEVC encoder
++    NVML_ENCODER_QUERY_AV1      = 0x02,        //!< AV1 encoder
++    NVML_ENCODER_QUERY_UNKNOWN  = 0xFF         //!< Unknown encoder
++}nvmlEncoderType_t;
++
++/**
++ * Structure to hold encoder session data
++ */
++typedef struct nvmlEncoderSessionInfo_st
++{
++    unsigned int       sessionId;       //!< Unique session ID
++    unsigned int       pid;             //!< Owning process ID
++    nvmlVgpuInstance_t vgpuInstance;    //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero)
++    nvmlEncoderType_t  codecType;       //!< Video encoder type
++    unsigned int       hResolution;     //!< Current encode horizontal resolution
++    unsigned int       vResolution;     //!< Current encode vertical resolution
++    unsigned int       averageFps;      //!< Moving average encode frames per second
++    unsigned int       averageLatency;  //!< Moving average encode latency in microseconds
++}nvmlEncoderSessionInfo_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures
++*  @{
++*/
++/***************************************************************************************************/
++
++/**
++ * Represents frame buffer capture session type
++ */
++typedef enum nvmlFBCSessionType_enum
++{
++    NVML_FBC_SESSION_TYPE_UNKNOWN = 0,     //!< Unknown
++    NVML_FBC_SESSION_TYPE_TOSYS,           //!< ToSys
++    NVML_FBC_SESSION_TYPE_CUDA,            //!< Cuda
++    NVML_FBC_SESSION_TYPE_VID,             //!< Vid
++    NVML_FBC_SESSION_TYPE_HWENC            //!< HWEnc
++} nvmlFBCSessionType_t;
++
++/**
++ * Structure to hold frame buffer capture sessions stats
++ */
++typedef struct nvmlFBCStats_st
++{
++    unsigned int      sessionsCount;    //!< Total no of sessions
++    unsigned int      averageFPS;       //!< Moving average new frames captured per second
++    unsigned int      averageLatency;   //!< Moving average new frame capture latency in microseconds
++} nvmlFBCStats_t;
++
++#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED                0x00000001    //!< Bit specifying differential map state.
++#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED      0x00000002    //!< Bit specifying classification map state.
++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT      0x00000004    //!< Bit specifying if capture was requested as non-blocking call.
++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE     0x00000008    //!< Bit specifying if capture was requested as blocking call.
++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT      0x00000010    //!< Bit specifying if capture was requested as blocking call with timeout period.
++
++/**
++ * Structure to hold FBC session data
++ */
++typedef struct nvmlFBCSessionInfo_st
++{
++    unsigned int          sessionId;                           //!< Unique session ID
++    unsigned int          pid;                                 //!< Owning process ID
++    nvmlVgpuInstance_t    vgpuInstance;                        //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero)
++    unsigned int          displayOrdinal;                      //!< Display identifier
++    nvmlFBCSessionType_t  sessionType;                         //!< Type of frame buffer capture session
++    unsigned int          sessionFlags;                        //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX).
++    unsigned int          hMaxResolution;                      //!< Max horizontal resolution supported by the capture session
++    unsigned int          vMaxResolution;                      //!< Max vertical resolution supported by the capture session
++    unsigned int          hResolution;                         //!< Horizontal resolution requested by caller in capture call
++    unsigned int          vResolution;                         //!< Vertical resolution requested by caller in capture call
++    unsigned int          averageFPS;                          //!< Moving average new frames captured per second
++    unsigned int          averageLatency;                      //!< Moving average new frame capture latency in microseconds
++} nvmlFBCSessionInfo_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlDrainDefs Drain State definitions
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ *  Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu()
++ */
++typedef enum nvmlDetachGpuState_enum
++{
++    NVML_DETACH_GPU_KEEP         = 0,
++    NVML_DETACH_GPU_REMOVE
++} nvmlDetachGpuState_t;
++
++/**
++ *  Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu()
++ */
++typedef enum nvmlPcieLinkState_enum
++{
++    NVML_PCIE_LINK_KEEP         = 0,
++    NVML_PCIE_LINK_SHUT_DOWN
++} nvmlPcieLinkState_t;
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlConfidentialComputingDefs Confidential Computing definitions
++ *  @{
++ */
++/***************************************************************************************************/
++/**
++ * Confidential Compute CPU Capabilities values
++ */
++#define NVML_CC_SYSTEM_CPU_CAPS_NONE      0
++#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV   1
++#define NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX 2
++
++/**
++ * Confidential Compute GPU Capabilities values
++ */
++#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0
++#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE     1
++
++typedef struct nvmlConfComputeSystemCaps_st {
++    unsigned int cpuCaps;
++    unsigned int gpusCaps;
++} nvmlConfComputeSystemCaps_t;
++
++/**
++ * Confidential Compute DevTools Mode values
++ */
++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF 0
++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_ON  1
++
++/**
++ * Confidential Compute Environment values
++ */
++#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0
++#define NVML_CC_SYSTEM_ENVIRONMENT_SIM         1
++#define NVML_CC_SYSTEM_ENVIRONMENT_PROD        2
++
++/**
++ * Confidential Compute Feature Status values
++ */
++#define NVML_CC_SYSTEM_FEATURE_DISABLED 0
++#define NVML_CC_SYSTEM_FEATURE_ENABLED  1
++
++typedef struct nvmlConfComputeSystemState_st {
++    unsigned int environment;
++    unsigned int ccFeature;
++    unsigned int devToolsMode;
++} nvmlConfComputeSystemState_t;
++
++/**
++ * Confidential Compute Multigpu mode values
++ */
++#define NVML_CC_SYSTEM_MULTIGPU_NONE 0
++#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1
++
++/**
++ * Confidential Compute System settings
++ */
++typedef struct {
++    unsigned int version;
++    unsigned int environment;
++    unsigned int ccFeature;
++    unsigned int devToolsMode;
++    unsigned int multiGpuMode;
++} nvmlSystemConfComputeSettings_v1_t;
++
++typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t;
++#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)
++
++/**
++ * Protected memory size
++ */
++typedef struct
++nvmlConfComputeMemSizeInfo_st
++{
++    unsigned long long protectedMemSizeKib;
++    unsigned long long unprotectedMemSizeKib;
++} nvmlConfComputeMemSizeInfo_t;
++
++/**
++ * Confidential Compute GPUs/System Ready State values
++ */
++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0
++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE  1
++
++/**
++ * GPU Certificate Details
++ */
++#define NVML_GPU_CERT_CHAIN_SIZE 0x1000
++#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1400
++
++typedef struct nvmlConfComputeGpuCertificate_st {
++    unsigned int certChainSize;
++    unsigned int attestationCertChainSize;
++    unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE];
++    unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE];
++} nvmlConfComputeGpuCertificate_t;
++
++/**
++ * GPU Attestation Report
++ */
++#define NVML_CC_GPU_CEC_NONCE_SIZE 0x20
++#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000
++#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000
++#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0
++#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1
++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN 50
++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX 75
++
++typedef struct nvmlConfComputeGpuAttestationReport_st {
++    unsigned int isCecAttestationReportPresent;
++    unsigned int attestationReportSize;
++    unsigned int cecAttestationReportSize;
++    unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE];
++    unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE];
++    unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE];
++} nvmlConfComputeGpuAttestationReport_t;
++
++typedef struct nvmlConfComputeSetKeyRotationThresholdInfo_st {
++    unsigned int version;
++    unsigned long long maxAttackerAdvantage;
++} nvmlConfComputeSetKeyRotationThresholdInfo_v1_t;
++
++typedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t nvmlConfComputeSetKeyRotationThresholdInfo_t;
++#define nvmlConfComputeSetKeyRotationThresholdInfo_v1 \
++        NVML_STRUCT_VERSION(ConfComputeSetKeyRotationThresholdInfo, 1)
++
++typedef struct nvmlConfComputeGetKeyRotationThresholdInfo_st {
++    unsigned int version;
++    unsigned long long attackerAdvantage;
++} nvmlConfComputeGetKeyRotationThresholdInfo_v1_t;
++
++typedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRotationThresholdInfo_t;
++#define nvmlConfComputeGetKeyRotationThresholdInfo_v1 \
++        NVML_STRUCT_VERSION(ConfComputeGetKeyRotationThresholdInfo, 1)
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlFabricDefs Fabric definitions
++ *  @{
++ */
++/***************************************************************************************************/
++
++#define NVML_GPU_FABRIC_UUID_LEN 16
++
++#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0
++#define NVML_GPU_FABRIC_STATE_NOT_STARTED   1
++#define NVML_GPU_FABRIC_STATE_IN_PROGRESS   2
++#define NVML_GPU_FABRIC_STATE_COMPLETED     3
++
++typedef unsigned char nvmlGpuFabricState_t;
++
++/**
++ * Contains the device fabric information
++ */
++typedef struct {
++    unsigned char        clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
++    nvmlReturn_t         status;                                //!< Error status, if any. Must be checked only if state returns "complete".
++    unsigned int         cliqueId;                              //!< ID of the fabric clique to which this GPU belongs
++    nvmlGpuFabricState_t state;                                 //!< Current state of GPU registration process
++} nvmlGpuFabricInfo_t;
++
++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0
++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE          1
++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE         2
++
++#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0   //!< Right-shift applied before masking
++#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x3 //!< Two-bit mask (binary 11) covering values 0..2; was 0x11, which read 2 back as 0
++
++/**
++ * Extracts one field of the GPU Fabric Health Status Mask: \a var is shifted right by the
++ * field's _MASK_SHIFT value and then AND-ed with the field's _MASK_WIDTH value.
++ * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW)
++ */
++#define NVML_GPU_FABRIC_HEALTH_GET(var, type)             \
++    (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \
++     (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type))
++
++/**
++ * Tests one field of the GPU Fabric Health Status Mask: evaluates to non-zero when the field
++ * extracted by NVML_GPU_FABRIC_HEALTH_GET equals the named NVML_GPU_FABRIC_HEALTH_MASK value.
++ * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE)
++ */
++#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \
++    (NVML_GPU_FABRIC_HEALTH_GET(var, type) ==       \
++     NVML_GPU_FABRIC_HEALTH_MASK##type##val)
++
++/**
++* GPU Fabric information (v2).
++*
++* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field
++* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask
++* field to the end. This structure is not backwards-compatible with
++* \ref nvmlGpuFabricInfo_t.
++*/
++typedef struct {
++    unsigned int         version;                               //!< Structure version identifier (set to \p nvmlGpuFabricInfo_v2)
++    unsigned char        clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
++    nvmlReturn_t         status;                                //!< Error status, if any. Must be checked only if state returns "complete".
++    unsigned int         cliqueId;                              //!< ID of the fabric clique to which this GPU belongs
++    nvmlGpuFabricState_t state;                                 //!< Current state of GPU registration process
++    unsigned int         healthMask;                            //!< GPU Fabric health Status Mask
++} nvmlGpuFabricInfo_v2_t;
++
++typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
++
++/**
++* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version.
++*/
++#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
++
++/**
++ * Device Scope - This is useful to retrieve the telemetry at GPU and module (e.g. GPU + CPU) level
++ */
++#define NVML_POWER_SCOPE_GPU     0U    //!< Targets only GPU
++#define NVML_POWER_SCOPE_MODULE  1U    //!< Targets the whole module
++#define NVML_POWER_SCOPE_MEMORY  2U    //!< Targets the GPU Memory
++
++typedef unsigned char nvmlPowerScopeType_t;
++
++/**
++ * Contains the power management limit
++ */
++typedef struct
++{
++    unsigned int         version;       //!< Structure version identifier (see \p nvmlPowerValue_v2)
++    nvmlPowerScopeType_t powerScope;    //!< [in]  Scope to target: one of the NVML_POWER_SCOPE_* values
++    unsigned int         powerValueMw;  //!< [out] Power value to retrieve or set in milliwatts
++} nvmlPowerValue_v2_t;
++
++#define nvmlPowerValue_v2 NVML_STRUCT_VERSION(PowerValue, 2)
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup
++ * This chapter describes the methods that handle NVML initialization and cleanup.
++ * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and
++ * nvmlShutdown() once NVML is no longer being used.
++ *  @{
++ */
++/***************************************************************************************************/
++
++#define NVML_INIT_FLAG_NO_GPUS      1   //!< Don't fail nvmlInit() when no GPUs are found
++#define NVML_INIT_FLAG_NO_ATTACH    2   //!< Don't attach GPUs
++
++/**
++ * Initialize NVML, but don't initialize any GPUs yet.
++ *
++ * \note See \ref nvmlInitWithFlags for a variant with a "flags" argument that allows passing
++ *       boolean values modifying the behaviour of nvmlInit().
++ * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that
++ *       did initialize all GPU devices in the system.
++ *
++ * This allows NVML to communicate with a GPU
++ * when other GPUs in the system are unstable or in a bad state.  When using this API, GPUs are
++ * discovered and initialized in nvmlDeviceGetHandleBy* functions instead.
++ *
++ * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in
++ *       a bad or unstable state.
++ *
++ * For all products.
++ *
++ * This method should be called once before invoking any other methods in the library.
++ * A reference count of the number of initializations is maintained.  Shutdown only occurs
++ * when the reference count reaches zero.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
++ *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
++ *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlInit_v2(void);
++
++/**
++ * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values
++ *       modifying the behaviour of nvmlInit().
++ *       Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2.
++ *
++ * For all products.
++ *
++ * @param flags                                 behaviour modifier flags
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
++ *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
++ *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags);
++
++/**
++ * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2().
++ *
++ * For all products.
++ *
++ * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2()
++ * A reference count of the number of initializations is maintained.  Shutdown only occurs
++ * when the reference count reaches zero.  For backwards compatibility, no error is reported if
++ * nvmlShutdown() is called more times than nvmlInit().
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if NVML has been properly shut down
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlShutdown(void);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlErrorReporting Error reporting
++ * This chapter describes helper functions for error reporting routines.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Helper method for converting NVML error codes into readable strings.
++ *
++ * For all products.
++ *
++ * @param result                               NVML error code to convert
++ *
++ * @return String representation of the error.
++ *
++ */
++const DECLDIR char* nvmlErrorString(nvmlReturn_t result);
++/** @} */
++
++
++/***************************************************************************************************/
++/** @defgroup nvmlConstants Constants
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion
++ */
++#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE       16
++
++/**
++ * Buffer size guaranteed to be large enough for storing GPU identifiers.
++ */
++#define NVML_DEVICE_UUID_BUFFER_SIZE                  80
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID
++ */
++#define NVML_DEVICE_UUID_V2_BUFFER_SIZE               96
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber
++ */
++#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE           80
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion
++ */
++#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE        80
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion
++ */
++#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE          80
++
++/**
++ * Buffer size guaranteed to be large enough for storing GPU device names.
++ */
++#define NVML_DEVICE_NAME_BUFFER_SIZE                  64
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName
++ */
++#define NVML_DEVICE_NAME_V2_BUFFER_SIZE               96
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial
++ */
++#define NVML_DEVICE_SERIAL_BUFFER_SIZE                30
++
++/**
++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion
++ */
++#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE         32
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlSystemQueries System Queries
++ * This chapter describes the queries that NVML can perform against the local system. These queries
++ * are not device-specific.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Retrieves the version of the system's graphics driver.
++ *
++ * For all products.
++ *
++ * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
++ * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
++ *
++ * @param version                              Reference in which to return the version identifier
++ * @param length                               The maximum allowed length of the string returned in \a version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length);
++
++/**
++ * Retrieves the version of the NVML library.
++ *
++ * For all products.
++ *
++ * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
++ * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE.
++ *
++ * @param version                              Reference in which to return the version identifier
++ * @param length                               The maximum allowed length of the string returned in \a version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length);
++
++/**
++ * Retrieves the version of the CUDA driver.
++ *
++ * For all products.
++ *
++ * The CUDA driver version returned will be retrieved from the currently installed version of CUDA.
++ * If the cuda library is not found, this function will return a known supported version number.
++ *
++ * @param cudaDriverVersion                    Reference in which to return the version identifier
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a cudaDriverVersion has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a cudaDriverVersion is NULL
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion);
++
++/**
++ * Retrieves the version of the CUDA driver from the shared library.
++ *
++ * For all products.
++ *
++ * The CUDA driver version is obtained by calling cuDriverGetVersion().
++ *
++ * @param cudaDriverVersion                    Reference in which to return the version identifier
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a cudaDriverVersion has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a cudaDriverVersion is NULL
++ *         - \ref NVML_ERROR_LIBRARY_NOT_FOUND  if \a libcuda.so.1 or libcuda.dll is not found
++ *         - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion);
++
++/**
++ * Macros for converting the CUDA driver version number to Major and Minor version numbers.
++ */
++#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000)
++#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10)
++
++/**
++ * Gets name of the process with provided process id
++ *
++ * For all products.
++ *
++ * Returned process name is cropped to provided length.
++ * name string is encoded in ANSI.
++ *
++ * @param pid                                  The identifier of the process
++ * @param name                                 Reference in which to return the process name
++ * @param length                               The maximum allowed length of the string returned in \a name
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a name has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a name is NULL or \a length is 0.
++ *         - \ref NVML_ERROR_NOT_FOUND         if the process doesn't exist
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length);
++
++/**
++ * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
++ *
++ * For S-class products.
++ *
++ * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array.
++ * The HIC must be connected to an S-class system for it to be reported by this function.
++ *
++ * @param hwbcCount                            Size of hwbcEntries array
++ * @param hwbcEntries                          Array holding information about hwbc
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a hwbcCount and \a hwbcEntries have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if either \a hwbcCount or \a hwbcEntries is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries);
++
++/**
++ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number.
++ * For all products.
++ * Supported on Linux only.
++ *
++ * @param cpuNumber                            The CPU number
++ * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray
++ *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
++ *                                             number of device handles.
++ * @param deviceArray                          An array of device handles for GPUs found with affinity to \a cpuNumber
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
++
++/**
++ * Structure to store Driver branch information
++ */
++typedef struct
++{
++    unsigned int version;                                           //!< The version number of this struct
++    char         branch[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];    //!< driver branch
++} nvmlSystemDriverBranchInfo_v1_t;
++typedef nvmlSystemDriverBranchInfo_v1_t nvmlSystemDriverBranchInfo_t;
++#define nvmlSystemDriverBranchInfo_v1 NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1)
++
++/**
++ * Retrieves the driver branch of the NVIDIA driver installed on the system.
++ *
++ * For all products.
++ *
++ * The branch identifier is an alphanumeric string.  It will not exceed 80 characters in length
++ * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
++ *
++ * @param branchInfo                            Pointer to the driver branch information structure \a nvmlSystemDriverBranchInfo_t
++ * @param length                                The maximum allowed length of the driver branch string
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a branchInfo is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetDriverBranch(nvmlSystemDriverBranchInfo_t *branchInfo, unsigned int length);
++
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlUnitQueries Unit Queries
++ * This chapter describes the queries that NVML can perform against each unit. For S-class systems only.
++ * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by
++ * calling \ref nvmlUnitGetHandleByIndex().
++ *  @{
++ */
++/***************************************************************************************************/
++
++ /**
++ * Retrieves the number of units in the system.
++ *
++ * For S-class products.
++ *
++ * @param unitCount                            Reference in which to return the number of units
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a unitCount has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unitCount is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount);
++
++/**
++ * Acquire the handle for a particular unit, based on its index.
++ *
++ * For S-class products.
++ *
++ * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount().
++ *   For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1.
++ *
++ * The order in which NVML enumerates units has no guarantees of consistency between reboots.
++ *
++ * @param index                                The index of the target unit, >= 0 and < \a unitCount
++ * @param unit                                 Reference in which to return the unit handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a unit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a index is invalid or \a unit is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit);
++
++/**
++ * Retrieves the static information associated with a unit.
++ *
++ * For S-class products.
++ *
++ * See \ref nvmlUnitInfo_t for details on available unit info.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param info                                 Reference in which to return the unit information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a info has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a info is NULL
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info);
++
++/**
++ * Retrieves the LED state associated with this unit.
++ *
++ * For S-class products.
++ *
++ * See \ref nvmlLedState_t for details on allowed states.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param state                                Reference in which to return the current LED state
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a state has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a state is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlUnitSetLedState()
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state);
++
++/**
++ * Retrieves the PSU stats for the unit.
++ *
++ * For S-class products.
++ *
++ * See \ref nvmlPSUInfo_t for details on available PSU info.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param psu                                  Reference in which to return the PSU information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a psu has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a psu is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu);
++
++/**
++ * Retrieves the temperature readings for the unit, in degrees C.
++ *
++ * For S-class products.
++ *
++ * Depending on the product, readings may be available for intake (type=0),
++ * exhaust (type=1) and board (type=2).
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param type                                 The type of reading to take
++ * @param temp                                 Reference in which to return the temperature reading
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a temp has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a type is invalid or \a temp is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp);
++
++/**
++ * Retrieves the fan speed readings for the unit.
++ *
++ * For S-class products.
++ *
++ * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param fanSpeeds                            Reference in which to return the fan speed information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a fanSpeeds has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a fanSpeeds is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds);
++
++/**
++ * Retrieves the set of GPU devices that are attached to the specified unit.
++ *
++ * For S-class products.
++ *
++ * The \a deviceCount argument is expected to be set to the size of the input \a devices array.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param deviceCount                          Reference in which to provide the \a devices array size, and
++ *                                             to return the number of attached GPU devices
++ * @param devices                              Reference in which to return the references to the attached GPU devices
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a deviceCount and \a devices have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid, or either of \a deviceCount or \a devices is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlDeviceQueries Device Queries
++ * This chapter describes the queries that NVML can perform against each device.
++ * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by
++ * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(),
++ * \ref nvmlDeviceGetHandleByPciBusId_v2(), or \ref nvmlDeviceGetHandleByUUID().
++ *  @{
++ */
++/***************************************************************************************************/
++
++ /**
++ * Retrieves the number of compute devices in the system. A compute device is a single GPU.
++ *
++ * For all products.
++ *
++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
++ *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
++ *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
++ *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
++ *       library.
++ *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
++ *
++ * @param deviceCount                          Reference in which to return the number of accessible devices
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a deviceCount has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a deviceCount is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount);
++
++/**
++ * Get attributes (engine counts etc.) for the given NVML device handle.
++ *
++ * @note This API currently only supports MIG device handles.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               NVML device handle
++ * @param attributes                           Device attributes
++ *
++ * @return
++ *        - \ref NVML_SUCCESS                  if \a device attributes were successfully retrieved
++ *        - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device handle is invalid
++ *        - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *        - \ref NVML_ERROR_NOT_SUPPORTED      if this query is not supported by the device
++ *        - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes);
++
++/**
++ * Acquire the handle for a particular device, based on its index.
++ *
++ * For all products.
++ *
++ * Valid indices are derived from the \a accessibleDevices count returned by
++ *   \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices
++ *   are 0 and 1, corresponding to GPU 0 and GPU 1.
++ *
++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
++ *   is recommended that devices be looked up by their PCI ids or UUID. See
++ *   \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2().
++ *
++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
++ *
++ * Starting from NVML 5, this API causes NVML to initialize the target GPU
++ * NVML may initialize additional GPUs if:
++ *  - The target GPU is an SLI slave
++ *
++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
++ *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
++ *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
++ *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
++ *       library.
++ *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
++ *
++ *       This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index.
++ *       If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't
++ *       need to worry about that.
++ *
++ * @param index                                The index of the target GPU, >= 0 and < \a accessibleDevices
++ * @param device                               Reference in which to return the device handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a device has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a index is invalid or \a device is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
++ *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
++ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ *
++ * @see nvmlDeviceGetIndex
++ * @see nvmlDeviceGetCount
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device);
++
++/**
++ * Acquire the handle for a particular device, based on its board serial number.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * This number corresponds to the value printed directly on the board, and to the value returned by
++ *   \ref nvmlDeviceGetSerial().
++ *
++ * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor
++ *             of \ref nvmlDeviceGetHandleByUUID.
++ *             For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT.
++ *
++ * Starting from NVML 5, this API causes NVML to initialize the target GPU
++ * NVML may initialize additional GPUs as it searches for the target GPU
++ *
++ * @param serial                               The board serial number of the target GPU
++ * @param device                               Reference in which to return the device handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a device has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a serial is invalid, \a device is NULL or more than one
++ *                                              device has the same serial (dual GPU boards)
++ *         - \ref NVML_ERROR_NOT_FOUND          if \a serial does not match a valid device on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
++ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ *
++ * @see nvmlDeviceGetSerial
++ * @see nvmlDeviceGetHandleByUUID
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device);
++
++/**
++ * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device.
++ *
++ * For all products.
++ *
++ * @param uuid                                 The UUID of the target GPU or MIG instance
++ * @param device                               Reference in which to return the device handle or MIG device handle
++ *
++ * Starting from NVML 5, this API causes NVML to initialize the target GPU
++ * NVML may initialize additional GPUs as it searches for the target GPU
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a device has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a uuid is invalid or \a device is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND          if \a uuid does not match a valid device on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
++ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ *
++ * @see nvmlDeviceGetUUID
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device);
++
++/**
++ * Acquire the handle for a particular device, based on its PCI bus id.
++ *
++ * For all products.
++ *
++ * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3().
++ *
++ * Starting from NVML 5, this API causes NVML to initialize the target GPU
++ * NVML may initialize additional GPUs if:
++ *  - The target GPU is an SLI slave
++ *
++ * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND
++ *       instead of NVML_ERROR_NO_PERMISSION.
++ *
++ * @param pciBusId                             The PCI bus id of the target GPU
++ * @param device                               Reference in which to return the device handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a device has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a pciBusId is invalid or \a device is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND          if \a pciBusId does not match a valid device on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables
++ *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
++ *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device);
++
++/**
++ * Retrieves the name of this device.
++ *
++ * For all products.
++ *
++ * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not
++ * exceed 96 characters in length (including the NULL terminator).  See \ref
++ * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE.
++ *
++ * When used with MIG device handles the API returns MIG device names which can be used to identify devices
++ * based on their attributes.
++ *
++ * @param device                               The identifier of the target device
++ * @param name                                 Reference in which to return the product name
++ * @param length                               The maximum allowed length of the string returned in \a name
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a name has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a name is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length);
++
++/**
++ * Retrieves the brand of this device.
++ *
++ * For all products.
++ *
++ * The type is a member of \ref nvmlBrandType_t defined above.
++ *
++ * @param device                               The identifier of the target device
++ * @param type                                 Reference in which to return the product brand type
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a type has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a type is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type);
++
++/**
++ * Retrieves the NVML index of this device.
++ *
++ * For all products.
++ *
++ * Valid indices are derived from the \a accessibleDevices count returned by
++ *   \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices
++ *   are 0 and 1, corresponding to GPU 0 and GPU 1.
++ *
++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
++ *   is recommended that devices be looked up by their PCI ids or GPU UUID. See
++ *   \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID().
++ *
++ * When used with MIG device handles this API returns indices that can be
++ * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle.
++ * MIG device indices are unique within a device.
++ *
++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
++ *
++ * @param device                               The identifier of the target device
++ * @param index                                Reference in which to return the NVML index of the device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a index has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a index is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetHandleByIndex()
++ * @see nvmlDeviceGetCount()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index);
++
++/**
++ * Retrieves the globally unique board serial number associated with this device's board.
++ *
++ * For all products with an inforom.
++ *
++ * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator).
++ * This number matches the serial number tag that is physically attached to the board.  See \ref
++ * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE.
++ *
++ * @param device                               The identifier of the target device
++ * @param serial                               Reference in which to return the board/module serial number
++ * @param length                               The maximum allowed length of the string returned in \a serial
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a serial has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a serial is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
++
++/**
++ * Get a unique identifier for the device module on the baseboard
++ *
++ * This API retrieves a unique identifier for each GPU module that exists on a given baseboard.
++ * For non-baseboard products, this ID would always be 0.
++ *
++ * @param device                               The identifier of the target device
++ * @param moduleId                             Unique identifier for the GPU module
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a moduleId has been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a moduleId is invalid
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetModuleId(nvmlDevice_t device, unsigned int *moduleId);
++
++/**
++ * Retrieves the Device's C2C Mode information
++ *
++ * @param device                               The identifier of the target device
++ * @param c2cModeInfo                          Output struct containing the device's C2C Mode info
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the C2C Mode Info query is successful
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a c2cModeInfo is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetC2cModeInfoV(nvmlDevice_t device, nvmlC2cModeInfo_v1_t *c2cModeInfo);
++
++/***************************************************************************************************/
++
++/** @defgroup nvmlAffinity CPU and Memory Affinity
++ *  This chapter describes NVML operations that are associated with CPU and memory
++ *  affinity.
++ *  @{
++ */
++/***************************************************************************************************/
++
++//! Scope of NUMA node for affinity queries
++#define NVML_AFFINITY_SCOPE_NODE     0
++//! Scope of processor socket for affinity queries
++#define NVML_AFFINITY_SCOPE_SOCKET   1
++
++typedef unsigned int nvmlAffinityScope_t;
++
++/**
++ * Retrieves an array of unsigned longs (sized to nodeSetSize) of bitmasks with
++ * the ideal memory affinity within node or socket for the device.
++ * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize ==  1,
++ *     result[0] = 0x3
++ *
++ * \note If requested scope is not applicable to the target topology, the API
++ *       will fall back to reporting the memory affinity for the immediate non-I/O
++ *       ancestor of the device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ * @param nodeSetSize                          The size of the nodeSet array that is safe to access
++ * @param nodeSet                              Array reference in which to return a bitmask of NODEs, 64 NODEs per
++ *                                             unsigned long on 64-bit machines, 32 on 32-bit machines
++ * @param scope                                Scope that changes the default behavior
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a nodeSet has been filled
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++
++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope);
++
++/**
++ * Retrieves an array of unsigned longs (sized to cpuSetSize) of bitmasks with the
++ * ideal CPU affinity within node or socket for the device.
++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
++ *     result[0] = 0x3, result[1] = 0x3
++ *
++ * \note If requested scope is not applicable to the target topology, the API
++ *       will fall back to reporting the CPU affinity for the immediate non-I/O
++ *       ancestor of the device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ * @param cpuSetSize                           The size of the cpuSet array that is safe to access
++ * @param cpuSet                               Array reference in which to return a bitmask of CPUs, 64 CPUs per
++ *                                                 unsigned long on 64-bit machines, 32 on 32-bit machines
++ * @param scope                                Scope that changes the default behavior
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a cpuSet has been filled
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or scope is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++
++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope);
++
++/**
++ * Retrieves an array of unsigned longs (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device.
++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
++ *     result[0] = 0x3, result[1] = 0x3
++ * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ * @param cpuSetSize                           The size of the cpuSet array that is safe to access
++ * @param cpuSet                               Array reference in which to return a bitmask of CPUs, 64 CPUs per
++ *                                                 unsigned long on 64-bit machines, 32 on 32-bit machines
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a cpuSet has been filled
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
++
++/**
++ * Sets the ideal affinity for the calling thread and device using the guidelines
++ * given in nvmlDeviceGetCpuAffinity().  Note, this is a change as of version 8.0.
++ * Older versions set the affinity for a calling process and all children.
++ * Currently supports up to 1024 processors.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the calling thread has been successfully bound
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
++
++/**
++ * Clear all affinity bindings for the calling thread.  Note, this is a change as of version
++ * 8.0 as older versions cleared the affinity for a calling process and all children.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the calling thread has been successfully unbound
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
++
++/**
++ * Get the NUMA node of the given GPU device.
++ * This only applies to platforms where the GPUs are NUMA nodes.
++ *
++ * @param[in]      device                  The device handle
++ * @param[out]     node                    NUMA node ID of the device
++ *
++ * @returns
++ *         - \ref NVML_SUCCESS                  if the NUMA node is retrieved successfully
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if request is not supported on the current platform
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device or \a node is invalid
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node);
++/** @} */
++
++/**
++ * Retrieve the common ancestor for two devices
++ * For all products.
++ * Supported on Linux only.
++ *
++ * @param device1                              The identifier of the first device
++ * @param device2                              The identifier of the second device
++ * @param pathInfo                             A \ref nvmlGpuTopologyLevel_t that gives the path type
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pathInfo has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
++
++/**
++ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level
++ * For all products.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the first device
++ * @param level                                The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
++ * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray
++ *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
++ *                                             number of device handles.
++ * @param deviceArray                          An array of device handles for GPUs found at \a level
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
++
++/**
++ * Retrieve the status for a given p2p capability index between a given pair of GPUs
++ *
++ * @param device1                              The first device
++ * @param device2                              The second device
++ * @param p2pIndex                             p2p Capability Index being looked for between \a device1 and \a device2
++ * @param p2pStatus                            Reference in which to return the status of the \a p2pIndex
++ *                                             between \a device1 and \a device2
++ * @return
++ *         - \ref NVML_SUCCESS         if \a p2pStatus has been populated
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT     if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL
++ *         - \ref NVML_ERROR_UNKNOWN              on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus);
++
++/**
++ * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string,
++ * that augments the immutable, board serial identifier.
++ *
++ * For all products.
++ *
++ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
++ * It does NOT correspond to any identifier printed on the board.  It will not exceed 96 characters in length
++ * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE.
++ *
++ * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG
++ * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device.
++ *
++ * @param device                               The identifier of the target device
++ * @param uuid                                 Reference in which to return the GPU UUID
++ * @param length                               The maximum allowed length of the string returned in \a uuid
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a uuid has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a uuid is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
++
++/**
++ * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for
++ * each GPU will have the form /dev/nvidia[minor number].
++ *
++ * For all products.
++ * Supported only for Linux
++ *
++ * @param device                                The identifier of the target device
++ * @param minorNumber                           Reference in which to return the minor number for the device
++ * @return
++ *         - \ref NVML_SUCCESS                 if the minor number is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minorNumber is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
++
++/**
++ * Retrieves the device board part number which is programmed into the board's InfoROM
++ *
++ * For all products.
++ *
++ * @param device                                Identifier of the target device
++ * @param partNumber                            Reference to the buffer to return
++ * @param length                                Length of the buffer reference
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a partNumber has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if the needed VBIOS fields have not been filled
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a partNumber is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
++
++/**
++ * Retrieves the version information for the device's infoROM object.
++ *
++ * For all products with an inforom.
++ *
++ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate
++ * ECC counts. The version of the data structures in this memory may change from time to time. It will not
++ * exceed 16 characters in length (including the NULL terminator).
++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
++ *
++ * See \ref nvmlInforomObject_t for details on the available infoROM objects.
++ *
++ * @param device                               The identifier of the target device
++ * @param object                               The target infoROM object
++ * @param version                              Reference in which to return the infoROM version
++ * @param length                               The maximum allowed length of the string returned in \a version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetInforomImageVersion
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
++
++/**
++ * Retrieves the global infoROM image version
++ *
++ * For all products with an inforom.
++ *
++ * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board
++ * in contrast to infoROM object version which is only an indicator of supported features.
++ * Version string will not exceed 16 characters in length (including the NULL terminator).
++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
++ *
++ * @param device                               The identifier of the target device
++ * @param version                              Reference in which to return the infoROM image version
++ * @param length                               The maximum allowed length of the string returned in \a version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetInforomVersion
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
++
++/**
++ * Retrieves the checksum of the configuration stored in the device's infoROM.
++ *
++ * For all products with an inforom.
++ *
++ * Can be used to make sure that two GPUs have the exact same configuration.
++ * Current checksum takes into account configuration stored in PWR and ECC infoROM objects.
++ * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC)
++ *
++ * @param device                               The identifier of the target device
++ * @param checksum                             Reference in which to return the infoROM configuration checksum
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a checksum has been set
++ *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a checksum is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
++
++/**
++ * Reads the infoROM from the flash and verifies the checksums.
++ *
++ * For all products with an inforom.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if infoROM is not corrupted
++ *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
++
++/**
++ * Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run.
++ *
++ * For all products with an inforom.
++ *
++ * @param device                               The identifier of the target device
++ * @param timestamp                            The start timestamp of the last BBX Flush
++ * @param durationUs                           The duration (us) of the last BBX Flush
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a timestamp and \a durationUs are successfully retrieved
++ *         - \ref NVML_ERROR_NOT_READY         if the BBX object has not been flushed yet
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetInforomVersion
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetLastBBXFlushTime(nvmlDevice_t device, unsigned long long *timestamp,
++                                                   unsigned long *durationUs);
++
++/**
++ * Retrieves the display mode for the device.
++ *
++ * For all products.
++ *
++ * This method indicates whether a physical display (e.g. monitor) is currently connected to
++ * any of the device's connectors.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param display                              Reference in which to return the display mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a display has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a display is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
++
++/**
++ * Retrieves the display active state for the device.
++ *
++ * For all products.
++ *
++ * This method indicates whether a display is initialized on the device.
++ * For example whether X Server is attached to this device and has allocated memory for the screen.
++ *
++ * Display can be active even when no monitor is physically attached.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param isActive                             Reference in which to return the display active state
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isActive has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isActive is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
++
++/**
++ * Retrieves the persistence mode associated with this device.
++ *
++ * For all products.
++ * For Linux only.
++ *
++ * When driver persistence mode is enabled the driver software state is not torn down when the last
++ * client disconnects. By default this feature is disabled.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 Reference in which to return the current driver persistence mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a mode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetPersistenceMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
++
++/**
++ * Retrieves PCI attributes of this device.
++ *
++ * For all products.
++ *
++ * See \ref nvmlPciInfoExt_v1_t for details on the available PCI info.
++ *
++ * @param device                               The identifier of the target device
++ * @param pci                                  Reference in which to return the PCI info
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pci has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pci is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfoExt(nvmlDevice_t device, nvmlPciInfoExt_t *pci);
++
++/**
++ * Retrieves the PCI attributes of this device.
++ *
++ * For all products.
++ *
++ * See \ref nvmlPciInfo_t for details on the available PCI info.
++ *
++ * @param device                               The identifier of the target device
++ * @param pci                                  Reference in which to return the PCI info
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pci has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pci is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci);
++
++/**
++ * Retrieves the maximum PCIe link generation possible with this device and system
++ *
++ * E.g. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will
++ * report is generation 1.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param maxLinkGen                           Reference in which to return the max PCIe link generation
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a maxLinkGen has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkGen is null
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
++
++/**
++ * Retrieves the maximum PCIe link generation supported by this device
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param maxLinkGenDevice                     Reference in which to return the max PCIe link generation
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a maxLinkGenDevice has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkGenDevice is null
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGenDevice);
++
++/**
++ * Retrieves the maximum PCIe link width possible with this device and system
++ *
++ * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report
++ * a max link width of 8.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param maxLinkWidth                         Reference in which to return the max PCIe link width
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a maxLinkWidth has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkWidth is null
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
++
++/**
++ * Retrieves the current PCIe link generation
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param currLinkGen                          Reference in which to return the current PCIe link generation
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a currLinkGen has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkGen is null
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
++
++/**
++ * Retrieves the current PCIe link width
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param currLinkWidth                        Reference in which to return the current PCIe link width
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a currLinkWidth has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkWidth is null
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
++
++/**
++ * Retrieve PCIe utilization information.
++ * This function queries a byte counter over a 20ms interval and thus reports the
++ *   PCIe throughput over that interval.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * This method is not supported in virtual machines running virtual GPU (vGPU).
++ *
++ * @param device                               The identifier of the target device
++ * @param counter                              The specific counter that should be queried \ref nvmlPcieUtilCounter_t
++ * @param value                                Reference in which to return throughput in KB/s
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a value has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a counter is invalid, or \a value is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
++
++/**
++ * Retrieve the PCIe replay counter.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param value                                Reference in which to return the counter's value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a value has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a value is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
++
++/**
++ * Retrieves the current clock speeds for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlClockType_t for details on available clock information.
++ *
++ * @param device                               The identifier of the target device
++ * @param type                                 Identify which clock domain to query
++ * @param clock                                Reference in which to return the clock speed in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clock has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
++
++/**
++ * Retrieves the maximum clock speeds for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlClockType_t for details on available clock information.
++ *
++ * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
++ *       by a few MHz.
++ *
++ * @param device                               The identifier of the target device
++ * @param type                                 Identify which clock domain to query
++ * @param clock                                Reference in which to return the clock speed in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clock has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
++
++/**
++ * Retrieve the GPCCLK VF offset value
++ * @param[in]   device                         The identifier of the target device
++ * @param[out]  offset                         The retrieved GPCCLK VF offset value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a offset has been successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a offset is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset);
++
++/**
++ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
++ * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param clockType                            Identify which clock domain to query
++ * @param clockMHz                             Reference in which to return the clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
++
++/**
++ * Retrieves the default applications clock that GPU boots with or
++ * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param clockType                            Identify which clock domain to query
++ * @param clockMHz                             Reference in which to return the default clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * \see nvmlDeviceGetApplicationsClock
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
++
++/**
++ * Retrieves the clock speed for the clock specified by the clock type and clock ID.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param clockType                            Identify which clock domain to query
++ * @param clockId                              Identify which clock in the domain to query
++ * @param clockMHz                             Reference in which to return the clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
++
++/**
++ * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param clockType                            Identify which clock domain to query
++ * @param clockMHz                             Reference in which to return the clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or the \a clockType on this device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
++
++/**
++ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param count                                Reference in which to provide the \a clocksMHz array size, and
++ *                                             to return the number of elements
++ * @param clocksMHz                            Reference in which to return the clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
++ *                                                required elements)
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetApplicationsClocks
++ * @see nvmlDeviceGetSupportedGraphicsClocks
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
++
++/**
++ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param memoryClockMHz                       Memory clock for which to return possible graphics clocks
++ * @param count                                Reference in which to provide the \a clocksMHz array size, and
++ *                                             to return the number of elements
++ * @param clocksMHz                            Reference in which to return the clocks in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NOT_FOUND         if the specified \a memoryClockMHz is not a supported frequency
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetApplicationsClocks
++ * @see nvmlDeviceGetSupportedMemoryClocks
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
++
++/**
++ * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
++ * to maximize performance as thermal limits allow.
++ *
++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
++ * behavior.
++ *
++ * @param device                               The identifier of the target device
++ * @param isEnabled                            Where to store the current state of Auto Boosted clocks of the target device
++ * @param defaultIsEnabled                     Where to store the default Auto Boosted clocks behavior of the target device that the device will
++ *                                                 revert to when no applications are using the GPU
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isEnabled has been set with the Auto Boosted clocks state of \a device
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isEnabled is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
++
++/**
++ * Retrieves the intended operating speed of the device's fan.
++ *
++ * Note: The reported speed is the intended fan speed.  If the fan is physically blocked and unable to spin, the
++ * output will not match the actual fan speed.
++ *
++ * For all discrete products with dedicated fans.
++ *
++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
++ * This value may exceed 100% in certain cases.
++ *
++ * @param device                               The identifier of the target device
++ * @param speed                                Reference in which to return the fan speed percentage
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a speed has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a speed is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a fan
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
++
++
++/**
++ * Retrieves the intended operating speed of the device's specified fan.
++ *
++ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the
++ * output will not match the actual fan speed.
++ *
++ * For all discrete products with dedicated fans.
++ *
++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
++ * This value may exceed 100% in certain cases.
++ *
++ * @param device                                The identifier of the target device
++ * @param fan                                   The index of the target fan, zero indexed.
++ * @param speed                                 Reference in which to return the fan speed percentage
++ *
++ * @return
++ *        - \ref NVML_SUCCESS                   if \a speed has been set
++ *        - \ref NVML_ERROR_UNINITIALIZED       if the library has not been successfully initialized
++ *        - \ref NVML_ERROR_INVALID_ARGUMENT    if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL
++ *        - \ref NVML_ERROR_NOT_SUPPORTED       if the device does not have a fan or is newer than Maxwell
++ *        - \ref NVML_ERROR_GPU_IS_LOST         if the target GPU has fallen off the bus or is otherwise inaccessible
++ *        - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed);
++
++/**
++ * Retrieves the intended target speed of the device's specified fan.
++ *
++ * Normally, the driver dynamically adjusts the fan based on
++ * the needs of the GPU.  But when the user sets the fan speed using nvmlDeviceSetFanSpeed_v2,
++ * the driver will attempt to make the fan achieve the setting in
++ * nvmlDeviceSetFanSpeed_v2.  The actual current speed of the fan
++ * is reported in nvmlDeviceGetFanSpeed_v2.
++ *
++ * For all discrete products with dedicated fans.
++ *
++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
++ * This value may exceed 100% in certain cases.
++ *
++ * @param device                                The identifier of the target device
++ * @param fan                                   The index of the target fan, zero indexed.
++ * @param targetSpeed                           Reference in which to return the fan speed percentage
++ *
++ * @return
++ *        - \ref NVML_SUCCESS                   if \a targetSpeed has been set
++ *        - \ref NVML_ERROR_UNINITIALIZED       if the library has not been successfully initialized
++ *        - \ref NVML_ERROR_INVALID_ARGUMENT    if \a device is invalid, \a fan is not an acceptable index, or \a targetSpeed is NULL
++ *        - \ref NVML_ERROR_NOT_SUPPORTED       if the device does not have a fan or is newer than Maxwell
++ *        - \ref NVML_ERROR_GPU_IS_LOST         if the target GPU has fallen off the bus or is otherwise inaccessible
++ *        - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed);
++
++/**
++ * Retrieves the min and max fan speed that user can set for the GPU fan.
++ *
++ * For all cuda-capable discrete products with fans
++ *
++ * @param device                        The identifier of the target device
++ * @param minSpeed                      The minimum speed allowed to set
++ * @param maxSpeed                      The maximum speed allowed to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a minSpeed and \a maxSpeed have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this
++ *                                                (doesn't have fans)
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed,
++                                                 unsigned int * maxSpeed);
++
++/**
++ * Gets current fan control policy.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * For all cuda-capable discrete products with fans
++ *
++ * @param device                        The identifier of the target \a device
++ * @param fan                           The index of the target fan
++ * @param policy                        Reference in which to return the fan control \a policy
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a policy has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a policy is null or the \a fan given doesn't reference a fan that exists.
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the \a device is older than Maxwell
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan,
++                                                      nvmlFanControlPolicy_t *policy);
++
++/**
++ * Retrieves the number of fans on the device.
++ *
++ * For all discrete products with dedicated fans.
++ *
++ * @param device                               The identifier of the target device
++ * @param numFans                              The number of fans
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a fan number query was successful
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a numFans is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a fan
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans);
++
++/**
++   * Retrieves the current temperature readings for the device, in degrees C.
++   *
++   * For all products.
++   *
++   * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
++   *
++   * @param device                               The identifier of the target device
++   * @param sensorType                           Flag that indicates which sensor reading to retrieve
++   * @param temp                                 Reference in which to return the temperature reading
++   *
++   * @return
++   *         - \ref NVML_SUCCESS                 if \a temp has been set
++   *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++   *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sensorType is invalid or \a temp is NULL
++   *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have the specified sensor
++   *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++   *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++   */
++nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
++
++
++/**
++ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
++ *
++ * Note: This API is no longer the preferred interface for retrieving the following temperature thresholds
++ * on Ada and later architectures: NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN,
++ * NVML_TEMPERATURE_THRESHOLD_MEM_MAX and NVML_TEMPERATURE_THRESHOLD_GPU_MAX.
++ *
++ * Support for reading these temperature thresholds for Ada and later architectures will be removed from this
++ * API in future releases. Please use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_TEMPERATURE_* fields to retrieve
++ * temperature thresholds on these architectures.
++ *
++ * @param device                               The identifier of the target device
++ * @param thresholdType                        The type of threshold value queried
++ * @param temp                                 Reference in which to return the temperature reading
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a temp has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a temperature sensor or is unsupported
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
++
++/**
++ * Used to execute a list of thermal system instructions.
++ *
++ * @param device                               The identifier of the target device
++ * @param sensorIndex                          The index of the thermal sensor
++ * @param pThermalSettings                     Reference in which to return the thermal sensor information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pThermalSettings has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pThermalSettings is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings);
++
++/**
++ * Retrieves the current performance state for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlPstates_t for details on allowed performance states.
++ *
++ * @param device                               The identifier of the target device
++ * @param pState                               Reference in which to return the performance state reading
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pState has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
++
++/**
++ * Retrieves current clocks event reasons.
++ *
++ * For all fully supported products.
++ *
++ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
++ *
++ * @param device                                The identifier of the target device
++ * @param clocksEventReasons                    Reference in which to return bitmask of active clocks event
++ *                                                  reasons
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a clocksEventReasons has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clocksEventReasons is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlClocksEventReasons
++ * @see nvmlDeviceGetSupportedClocksEventReasons
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, unsigned long long *clocksEventReasons);
++
++/**
++ * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
++
++/**
++ * Retrieves bitmask of supported clocks event reasons that can be returned by
++ * \ref nvmlDeviceGetCurrentClocksEventReasons
++ *
++ * For all fully supported products.
++ *
++ * This method is not supported in virtual machines running virtual GPU (vGPU).
++ *
++ * @param device                               The identifier of the target device
++ * @param supportedClocksEventReasons       Reference in which to return bitmask of supported
++ *                                              clocks event reasons
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a supportedClocksEventReasons has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a supportedClocksEventReasons is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlClocksEventReasons
++ * @see nvmlDeviceGetCurrentClocksEventReasons
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t device, unsigned long long *supportedClocksEventReasons);
++
++/**
++ * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
++
++/**
++ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
++ *
++ * Retrieve the current performance state for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlPstates_t for details on allowed performance states.
++ *
++ * @param device                               The identifier of the target device
++ * @param pState                               Reference in which to return the performance state reading
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pState has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
++
++/**
++ * Retrieve performance monitor samples from the associated subdevice.
++ *
++ * @param device                               The identifier of the target device
++ * @param pDynamicPstatesInfo                  Reference in which to return the dynamic P-states information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pDynamicPstatesInfo has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pDynamicPstatesInfo is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo);
++
++/**
++ * Retrieve the MemClk (Memory Clock) VF offset value.
++ * @param[in]   device                         The identifier of the target device
++ * @param[out]  offset                         The retrieved MemClk VF offset value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a offset has been successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a offset is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset);
++
++/**
++ * Retrieve min and max clocks of some clock domain for a given PState
++ *
++ * @param device                               The identifier of the target device
++ * @param type                                 Clock domain
++ * @param pstate                               PState to query
++ * @param minClockMHz                          Reference in which to return min clock frequency
++ * @param maxClockMHz                          Reference in which to return max clock frequency
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if everything worked
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a type or \a pstate are invalid or both
++ *                                                  \a minClockMHz and \a maxClockMHz are NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate,
++                                                      unsigned int * minClockMHz, unsigned int * maxClockMHz);
++
++/**
++ * Get all supported Performance States (P-States) for the device.
++ *
++ * The returned array would contain a contiguous list of valid P-States supported by
++ * the device. If the number of supported P-States is fewer than the size of the array
++ * supplied, missing elements would contain \a NVML_PSTATE_UNKNOWN.
++ *
++ * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES.
++ *
++ * @param device                               The identifier of the target device
++ * @param pstates                              Container to return the list of performance states
++ *                                             supported by device
++ * @param size                                 Size of the supplied \a pstates array in bytes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pstates array has been retrieved
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if the container supplied was not large enough to
++ *                                             hold the resulting list
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a pstates is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support performance state readings
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device,
++                                                             nvmlPstates_t *pstates, unsigned int size);
++
++/**
++ * Retrieve the GPCCLK min max VF offset value.
++ * @param[in]   device                         The identifier of the target device
++ * @param[out]  minOffset                      The retrieved GPCCLK VF min offset value
++ * @param[out]  maxOffset                      The retrieved GPCCLK VF max offset value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a minOffset and \a maxOffset have been successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minOffset or \a maxOffset is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device,
++                                                       int *minOffset, int *maxOffset);
++
++/**
++ * Retrieve the MemClk (Memory Clock) min max VF offset value.
++ * @param[in]   device                         The identifier of the target device
++ * @param[out]  minOffset                      The retrieved MemClk VF min offset value
++ * @param[out]  maxOffset                      The retrieved MemClk VF max offset value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a minOffset and \a maxOffset have been successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minOffset or \a maxOffset is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device,
++                                                       int *minOffset, int *maxOffset);
++
++/**
++ * Retrieve min, max and current clock offset of some clock domain for a given PState
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Note: \ref nvmlDeviceGetGpcClkVfOffset, \ref nvmlDeviceGetMemClkVfOffset, \ref nvmlDeviceGetGpcClkMinMaxVfOffset and
++ *       \ref nvmlDeviceGetMemClkMinMaxVfOffset will be deprecated in a future release.
++ *       Use \ref nvmlDeviceGetClockOffsets instead.
++ *
++ * @param device                               The identifier of the target device
++ * @param info                                 Structure specifying the clock type (input) and the pstate (input);
++ *                                             receives the clock offset value (output), min clock offset (output)
++ *                                             and max clock offset (output)
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                         if everything worked
++ *         - \ref NVML_ERROR_UNINITIALIZED             if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT          if \a device, \a type or \a pstate are invalid or both
++ *                                                             \a minClockOffsetMHz and \a maxClockOffsetMHz are NULL
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported
++ *         - \ref NVML_ERROR_NOT_SUPPORTED             if the device does not support this feature
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info);
++
++/**
++ * Control current clock offset of some clock domain for a given PState
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Requires privileged user.
++ *
++ * @param device                               The identifier of the target device
++ * @param info                                 Structure specifying the clock type (input), the pstate (input)
++ *                                             and clock offset value (input)
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                         if everything worked
++ *         - \ref NVML_ERROR_UNINITIALIZED             if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NO_PERMISSION             if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT          if \a device, \a type or \a pstate are invalid or
++ *                                                             \a clockOffsetMHz is out of allowed range.
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported
++ *         - \ref NVML_ERROR_NOT_SUPPORTED             if the device does not support this feature
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info);
++
++/**
++ * This API has been deprecated.
++ *
++ * Retrieves the power management mode associated with this device.
++ *
++ * For products from the Fermi family.
++ *     - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
++ *
++ * For products from the Kepler or newer families.
++ *     - Does not require \a NVML_INFOROM_POWER object.
++ *
++ * This flag indicates whether any power management algorithm is currently active on the device. An
++ * enabled state does not necessarily mean the device is being actively throttled -- only that
++ * the driver will do so if the appropriate conditions are met.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 Reference in which to return the current power management mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a mode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
++
++/**
++ * Retrieves the power management limit associated with this device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * The power limit defines the upper boundary for the card's power draw. If
++ * the card's total power draw reaches this limit the power management algorithm kicks in.
++ *
++ * This reading is only available if power management mode is supported.
++ * See \ref nvmlDeviceGetPowerManagementMode.
++ *
++ * @param device                               The identifier of the target device
++ * @param limit                                Reference in which to return the power management limit in milliwatts
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a limit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
++
++/**
++ * Retrieves information about possible values of power management limits on this device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param minLimit                             Reference in which to return the minimum power management limit in milliwatts
++ * @param maxLimit                             Reference in which to return the maximum power management limit in milliwatts
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a minLimit and \a maxLimit have been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minLimit or \a maxLimit is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetPowerManagementLimit
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
++
++/**
++ * Retrieves default power management limit on this device, in milliwatts.
++ * Default power management limit is a power management limit that the device boots with.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param defaultLimit                         Reference in which to return the default power management limit in milliwatts
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a defaultLimit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a defaultLimit is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
++
++/**
++ * Retrieves power usage for this GPU and its associated circuitry (e.g. memory), in milliwatts
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. On Ampere
++ * (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval. On GA100 and
++ * older architectures, instantaneous power is returned.
++ *
++ * See \ref NVML_FI_DEV_POWER_AVERAGE and \ref NVML_FI_DEV_POWER_INSTANT to query specific power
++ * values.
++ *
++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
++ *
++ * @param device                               The identifier of the target device
++ * @param power                                Reference in which to return the power usage information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a power has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a power is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support power readings
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
++
++/**
++ * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
++ *
++ * For Volta &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param energy                               Reference in which to return the energy consumption information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a energy has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a energy is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support energy readings
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy);
++
++/**
++ * Get the effective power limit that the driver enforces after taking into account all limiters
++ *
++ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere.
++ * This includes the out of band power limit interface.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                           The device to communicate with
++ * @param limit                            Reference in which to return the power management limit in milliwatts
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a limit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
++
++/**
++ * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot).
++ *
++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
++ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
++ *
++ * @param device                               The identifier of the target device
++ * @param current                              Reference in which to return the current GOM
++ * @param pending                              Reference in which to return the pending GOM
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a current and \a pending have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a current or \a pending is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlGpuOperationMode_t
++ * @see nvmlDeviceSetGpuOperationMode
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
++
++/**
++ * Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
++ * The reserved amount is supported on version 2 only.
++ *
++ * For all products.
++ *
++ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
++ * Under WDDM most device memory is allocated and managed on startup by Windows.
++ *
++ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated
++ * by all active channels on the device.
++ *
++ * See \ref nvmlMemory_v2_t for details on available memory info.
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate
++ *       information, only if the caller has appropriate privileges. Per-instance
++ *       information can be queried by using specific MIG device handles.
++ *
++ * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information.
++ *
++ * @note On systems where GPUs are NUMA nodes, the accuracy of FB memory utilization
++ *       provided by this API depends on the memory accounting of the operating system.
++ *       This is because FB memory is managed by the operating system instead of the NVIDIA GPU driver.
++ *       Typically, pages allocated from FB memory are not released even after
++ *       the process terminates to enhance performance. In scenarios where
++ *       the operating system is under memory pressure, it may resort to utilizing FB memory.
++ *       Such actions can result in discrepancies in the accuracy of memory reporting.
++ *
++ * @param device                               The identifier of the target device
++ * @param memory                               Reference in which to return the memory information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a memory has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memory is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
++
++/**
++ * nvmlDeviceGetMemoryInfo_v2 accounts separately for reserved memory and includes it in the used memory amount.
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory);
++
++/**
++ * Retrieves the current compute mode for the device.
++ *
++ * For all products.
++ *
++ * See \ref nvmlComputeMode_t for details on allowed compute modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 Reference in which to return the current compute mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a mode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetComputeMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
++
++/**
++ * Retrieves the CUDA compute capability of the device.
++ *
++ * For all products.
++ *
++ * Returns the major and minor compute capability version numbers of the
++ * device.  The major and minor versions are equivalent to the
++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and
++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be
++ * returned by CUDA's cuDeviceGetAttribute().
++ *
++ * @param device                               The identifier of the target device
++ * @param major                                Reference in which to return the major CUDA compute capability
++ * @param minor                                Reference in which to return the minor CUDA compute capability
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a major and \a minor have been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a major or \a minor are NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor);
++
++/**
++ * Retrieves the current and pending ECC modes for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
++ *
++ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
++ * the next reboot.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param current                              Reference in which to return the current ECC mode
++ * @param pending                              Reference in which to return the pending ECC mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a current and \a pending have been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or either \a current or \a pending is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetEccMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
++
++/**
++ * Retrieves the default ECC modes for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
++ *
++ * See \ref nvmlEnableState_t for details on allowed modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param defaultMode                          Reference in which to return the default ECC mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a defaultMode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a defaultMode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetEccMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode);
++
++/**
++ * Retrieves the device boardId from 0-N.
++ * Devices with the same boardId indicate GPUs connected to the same PLX.  Use in conjunction with
++ *  \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
++ *  The boardId returned is a unique ID for the current configuration.  Uniqueness and ordering across
++ *  reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
++ *  the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will
++ *  always return those values but they will always be different from each other).
++ *
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param boardId                              Reference in which to return the device's board ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a boardId has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a boardId is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
++
++/**
++ * Retrieves whether the device is on a Multi-GPU Board
++ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param multiGpuBool                         Reference in which to return a zero or non-zero value
++ *                                                 to indicate whether the device is on a multi GPU board
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a multiGpuBool has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a multiGpuBool is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
++
++/**
++ * Retrieves the total ECC error counts for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
++ * Requires ECC Mode to be enabled.
++ *
++ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of
++ * errors across the entire device.
++ *
++ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
++ * See \ref nvmlEccCounterType_t for a description of available counter types.
++ *
++ * @param device                               The identifier of the target device
++ * @param errorType                            Flag that specifies the type of the errors.
++ * @param counterType                          Flag that specifies the counter-type of the errors.
++ * @param eccCounts                            Reference in which to return the specified ECC errors
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a eccCounts has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceClearEccErrorCounts()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
++
++/**
++ * Retrieves the detailed ECC error counts for the device.
++ *
++ * @deprecated   This API supports only a fixed set of ECC error locations
++ *               On different GPU architectures different locations are supported
++ *               See \ref nvmlDeviceGetMemoryErrorCounter
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
++ * Requires ECC Mode to be enabled.
++ *
++ * Detailed errors provide separate ECC counts for specific parts of the memory system.
++ *
++ * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported.
++ *
++ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
++ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
++ *
++ * @param device                               The identifier of the target device
++ * @param errorType                            Flag that specifies the type of the errors.
++ * @param counterType                          Flag that specifies the counter-type of the errors.
++ * @param eccCounts                            Reference in which to return the specified ECC errors
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a eccCounts has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceClearEccErrorCounts()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
++
++/**
++ * Retrieves the requested memory error counter for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
++ *
++ * Only applicable to devices with ECC.
++ *
++ * Requires ECC Mode to be enabled.
++ *
++ * @note On MIG-enabled GPUs, per instance information can be queried using specific
++ *       MIG device handles. Per instance information is currently only supported for
++ *       non-DRAM uncorrectable volatile errors. Querying volatile errors using device
++ *       handles is currently not supported.
++ *
++ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
++ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
++ *
++ * @param device                               The identifier of the target device
++ * @param errorType                            Flag that specifies the type of error.
++ * @param counterType                          Flag that specifies the counter-type of the errors.
++ * @param locationType                         Specifies the location of the counter.
++ * @param count                                Reference in which to return the ECC counter
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a count has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType, \a counterType or \a locationType is
++ *                                             invalid, or \a count is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support ECC error reporting in the specified memory
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
++                                                   nvmlEccCounterType_t counterType,
++                                                   nvmlMemoryLocation_t locationType, unsigned long long *count);
++
++/**
++ * Retrieves the current utilization rates for the device's major subsystems.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlUtilization_t for details on available utilization rates.
++ *
++ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
++ *       This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
++ *
++ * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported.
++ *
++ * @param device                               The identifier of the target device
++ * @param utilization                          Reference in which to return the utilization information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a utilization is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
++
++/**
++ * Retrieves the current utilization and sampling size in microseconds for the Encoder
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported.
++ *
++ * @param device                               The identifier of the target device
++ * @param utilization                          Reference to an unsigned int for encoder utilization info
++ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
++
++/**
++ * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param encoderQueryType                  Type of encoder to query
++ * @param encoderCapacity                   Reference to an unsigned int for the encoder capacity
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a encoderCapacity is fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a encoderCapacity is NULL, or \a device or \a encoderQueryType
++ *                                              are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if device does not support the encoder specified in \a encoderQueryType
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity);
++
++/**
++ * Retrieves the current encoder statistics for a given device.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param sessionCount                      Reference to an unsigned int for count of active encoder sessions
++ * @param averageFps                        Reference to an unsigned int for trailing average FPS of all active sessions
++ * @param averageLatency                    Reference to an unsigned int for encode latency in microseconds
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a sessionCount, \a averageFps and \a averageLatency are fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device, \a sessionCount, \a averageFps
++ *                                              or \a averageLatency is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount,
++                                                unsigned int *averageFps, unsigned int *averageLatency);
++
++/**
++ * Retrieves information about active encoder sessions on a target device.
++ *
++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
++ * written to the buffer.
++ *
++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns
++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
++ * To query the number of active encoder sessions, call this function with *sessionCount = 0.  The code will return
++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
++ * @param sessionInfos                      Reference in which to return the session information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a sessionInfos is fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL.
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if this query is not supported by \a device
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos);
++
++/**
++ * Retrieves the current utilization and sampling size in microseconds for the Decoder
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported.
++ *
++ * @param device                               The identifier of the target device
++ * @param utilization                          Reference to an unsigned int for decoder utilization info
++ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
++
++/**
++ * Retrieves the current utilization and sampling size in microseconds for the JPG
++ *
++ * For Turing &tm; or newer fully supported devices.
++ *
++ * @note On MIG-enabled GPUs, querying JPG utilization is not currently supported.
++ *
++ * @param device                               The identifier of the target device
++ * @param utilization                          Reference to an unsigned int for jpg utilization info
++ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetJpgUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
++
++/**
++ * Retrieves the current utilization and sampling size in microseconds for the OFA (Optical Flow Accelerator)
++ *
++ * For Turing &tm; or newer fully supported devices.
++ *
++ * @note On MIG-enabled GPUs, querying OFA utilization is not currently supported.
++ *
++ * @param device                               The identifier of the target device
++ * @param utilization                          Reference to an unsigned int for ofa utilization info
++ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetOfaUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
++
++/**
++* Retrieves the active frame buffer capture sessions statistics for a given device.
++*
++* For Maxwell &tm; or newer fully supported devices.
++*
++* @param device                            The identifier of the target device
++* @param fbcStats                          Reference to nvmlFBCStats_t structure containing NvFBC stats
++*
++* @return
++*         - \ref NVML_SUCCESS                  if \a fbcStats is fetched
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a fbcStats is NULL
++*         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats);
++
++/**
++* Retrieves information about active frame buffer capture sessions on a target device.
++*
++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
++* written to the buffer.
++*
++* If the supplied buffer is not large enough to accommodate the active session array, the function returns
++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
++* To query the number of active FBC sessions, call this function with *sessionCount = 0.  The code will return
++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
++*
++* For Maxwell &tm; or newer fully supported devices.
++*
++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
++*       be zero if there are no new frames captured since the session started.
++*
++* @param device                            The identifier of the target device
++* @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
++* @param sessionInfo                       Reference in which to return the session information
++*
++* @return
++*         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL.
++*         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
++
++/**
++ * Retrieves the current and pending driver model for the device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * For Windows only.
++ *
++ * On Windows platforms the device driver can run in either WDDM, MCDM or WDM (TCC) modes. If a display is attached
++ * to the device it must run in WDDM mode. MCDM mode is preferred if a display is not attached. TCC mode is deprecated.
++ *
++ * See \ref nvmlDriverModel_t for details on available driver models.
++ *
++ * @param device                               The identifier of the target device
++ * @param current                              Reference in which to return the current driver model
++ * @param pending                              Reference in which to return the pending driver model
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a current and/or \a pending have been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or both \a current and \a pending are NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not Windows
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceSetDriverModel_v2()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
++
++/**
++ * Get VBIOS version of the device.
++ *
++ * For all products.
++ *
++ * The VBIOS version may change from time to time. It will not exceed 32 characters in length
++ * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
++ *
++ * @param device                               The identifier of the target device
++ * @param version                              Reference in which to return the VBIOS version
++ * @param length                               The maximum allowed length of the string returned in \a version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a version is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
++
++/**
++ * Get Bridge Chip Information for all the bridge chips on the board.
++ *
++ * For all fully supported products.
++ * Only applicable to multi-GPU products.
++ *
++ * @param device                                The identifier of the target device
++ * @param bridgeHierarchy                       Reference to the returned bridge chip Hierarchy
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if bridge chip exists
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a bridgeHierarchy is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if bridge chip not supported on the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
++
++/**
++ * Get information about processes with a compute context on a device
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * This function returns information only about compute running processes (e.g. CUDA applications which have an
++ * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
++ *
++ * To query the current number of running compute processes, call this function with *infoCount = 0. The
++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
++ * \a infos is allowed to be NULL.
++ *
++ * The usedGpuMemory field returned is all of the memory used by the application.
++ *
++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
++ * time. Allocate more space for \a infos table in case new compute processes are spawned.
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
++ *       the caller has appropriate privileges. Per-instance information can be queried by using
++ *       specific MIG device handles.
++ *       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
++ *
++ * @param device                               The device handle or MIG device handle
++ * @param infoCount                            Reference in which to provide the \a infos array size, and
++ *                                             to return the number of returned elements
++ * @param infos                                Reference in which to return the process information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
++ *                                             \a infoCount will contain minimal amount of space necessary for
++ *                                             the call to complete
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by \a device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see \ref nvmlSystemGetProcessName
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
++
++/**
++ * Get information about processes with a graphics context on a device
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * This function returns information only about graphics based processes
++ * (e.g. applications using OpenGL, DirectX)
++ *
++ * To query the current number of running graphics processes, call this function with *infoCount = 0. The
++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
++ * \a infos is allowed to be NULL.
++ *
++ * The usedGpuMemory field returned is all of the memory used by the application.
++ *
++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
++ * time. Allocate more space for \a infos table in case new graphics processes are spawned.
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
++ *       the caller has appropriate privileges. Per-instance information can be queried by using
++ *       specific MIG device handles.
++ *       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
++ *
++ * @param device                               The device handle or MIG device handle
++ * @param infoCount                            Reference in which to provide the \a infos array size, and
++ *                                             to return the number of returned elements
++ * @param infos                                Reference in which to return the process information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
++ *                                             \a infoCount will contain minimal amount of space necessary for
++ *                                             the call to complete
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by \a device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see \ref nvmlSystemGetProcessName
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
++
++/**
++ * Get information about processes with a Multi-Process Service (MPS) compute context on a device
++ *
++ * For Volta &tm; or newer fully supported devices.
++ *
++ * This function returns information only about compute running processes (e.g. CUDA applications which have
++ * an active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by
++ * this function.
++ *
++ * To query the current number of running compute processes, call this function with *infoCount = 0. The
++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
++ * \a infos is allowed to be NULL.
++ *
++ * The usedGpuMemory field returned is all of the memory used by the application.
++ *
++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
++ * time. Allocate more space for the \a infos table in case new compute processes are spawned.
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
++ *       the caller has appropriate privileges. Per-instance information can be queried by using
++ *       specific MIG device handles.
++ *       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
++ *
++ * @param device                               The device handle or MIG device handle
++ * @param infoCount                            Reference in which to provide the \a infos array size, and
++ *                                             to return the number of returned elements
++ * @param infos                                Reference in which to return the process information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
++ *                                             \a infoCount will contain minimal amount of space necessary for
++ *                                             the call to complete
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by \a device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see \ref nvmlSystemGetProcessName
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
++
++/**
++ * Get information about running processes on a device for input context
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * This function returns information only about running processes (e.g. CUDA applications which have
++ * an active context).
++ *
++ * To determine the size of the \a plist->procArray array to allocate, call the function with
++ * \a plist->numProcArrayEntries set to zero and \a plist->procArray set to NULL. The return
++ * code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type
++ * \a plist->mode to report on, in which case the \a plist->numProcArrayEntries field will
++ * indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type
++ * \a plist->mode exist).
++ *
++ * The usedGpuMemory field returned is all of the memory used by the application.
++ * The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application.
++ *
++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in
++ * time. Allocate more space for \a plist->procArray table in case new processes are spawned.
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
++ *       the caller has appropriate privileges. Per-instance information can be queried by using
++ *       specific MIG device handles.
++ *       Querying per-instance information using MIG device handles is not supported if the device is in
++ *       vGPU Host virtualization mode.
++ *       Protected memory usage is currently not available in MIG mode and on Windows.
++ *
++ * @param device                               The device handle or MIG device handle
++ * @param plist                                Reference in which to return the process detail list
++ * \a plist->version                       The api version
++ * \a plist->mode                          The process mode
++ * \a plist->procArray                     Reference in which to return the process information
++ * \a plist->numProcArrayEntries           Proc array size of returned entries
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a plist->numProcArrayEntries and \a plist->procArray have been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a plist->numProcArrayEntries indicates that the \a plist->procArray is too small
++ *                                             \a plist->numProcArrayEntries will contain minimal amount of space necessary for
++ *                                             the call to complete
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a plist is NULL, \a plist->version is invalid,
++ *                                             or \a plist->mode is invalid
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by \a device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t *plist);
++
++/**
++ * Check if the GPU devices are on the same physical board.
++ *
++ * For all fully supported products.
++ *
++ * @param device1                               The first GPU device
++ * @param device2                               The second GPU device
++ * @param onSameBoard                           Reference in which to return the status.
++ *                                              Non-zero indicates that the GPUs are on the same board.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a onSameBoard has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1 or \a device2 are invalid or \a onSameBoard is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this check is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if either GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
++
++/**
++ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
++ * If an API is restricted, only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
++ *
++ * For all fully supported products.
++ *
++ * @param device                               The identifier of the target device
++ * @param apiType                              Target API type for this operation
++ * @param isRestricted                         Reference in which to return the current restriction
++ *                                             NVML_FEATURE_ENABLED indicates that the API is root-only
++ *                                             NVML_FEATURE_DISABLED indicates that the API is accessible to all users
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device or the device does not support
++ *                                                 the feature that is being queried (e.g. enabling/disabling Auto Boosted clocks is
++ *                                                 not supported by the device)
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlRestrictedAPI_t
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
++
++/**
++ * Gets recent samples for the GPU.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by
++ * the driver.
++ *
++ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
++ *
++ * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL.
++ * The returned sampleCount will provide the number of samples that can be queried. The user needs to
++ * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t).
++ *
++ * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the
++ * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query
++ * to get more recent samples.
++ *
++ * This method fetches the number of entries which can be accommodated in the provided samples array, and the
++ * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this
++ * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost.
++ *
++ * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES,
++ *       NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported.
++ *
++ * @param device                        The identifier for the target device
++ * @param type                          Type of sampling event
++ * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param sampleValType                 Output parameter to represent the type of sample value as described in nvmlValueType_t
++ * @param sampleCount                   Reference to provide the number of elements which can be queried in samples array
++ * @param samples                       Reference in which samples are returned
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if samples are successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sampleCount is NULL or
++ *                                             reference to \a sampleCount is 0 for non null \a samples
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
++        nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
++
++/**
++ * Gets Total, Available and Used size of BAR1 memory.
++ *
++ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party
++ * devices (peer-to-peer on the PCIe bus).
++ *
++ * @note In MIG mode, if device handle is provided, the API returns aggregate
++ *       information, only if the caller has appropriate privileges. Per-instance
++ *       information can be queried by using specific MIG device handles.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param bar1Memory                           Reference in which BAR1 memory
++ *                                             information is returned.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if BAR1 memory is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a bar1Memory is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
++
++/**
++ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
++ * or thermal constraints.
++ *
++ * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The
++ * difference in violation times at two different reference times gives an indication of a GPU throttling event.
++ *
++ * Violation for thermal capping is not supported at this time.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param perfPolicyType                       Represents Performance policy which can trigger GPU throttling
++ * @param violTime                             Reference to which violation time related information is returned
++ *
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if violation time is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
++
++/**
++ * Gets the device's interrupt number
++ *
++ * @param device                               The identifier of the target device
++ * @param irqNum                               The interrupt number associated with the specified device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the IRQ number is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a irqNum is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqNum);
++
++/**
++ * Gets the device's core count
++ *
++ * @param device                               The identifier of the target device
++ * @param numCores                             The number of cores for the specified device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the GPU core count is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a numCores is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int *numCores);
++
++/**
++ * Gets the device's power source
++ *
++ * @param device                               The identifier of the target device
++ * @param powerSource                          The power source of the device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the current power source was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a powerSource is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t *powerSource);
++
++/**
++ * Gets the device's memory bus width
++ *
++ * @param device                               The identifier of the target device
++ * @param busWidth                             The device's memory bus width
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the memory bus width is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a busWidth is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int *busWidth);
++
++/**
++ * Gets the device's PCIE Max Link speed in MBPS
++ *
++ * @param device                               The identifier of the target device
++ * @param maxSpeed                             The device's PCIE Max Link speed in MBPS
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the PCIE Max Link speed is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a maxSpeed is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed);
++
++/**
++ * Gets the device's PCIe Link speed in Mbps
++ *
++ * @param device                               The identifier of the target device
++ * @param pcieSpeed                            The device's PCIe Link speed in Mbps
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pcieSpeed has been retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pcieSpeed is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support PCIe speed getting
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed);
++
++/**
++ * Gets the device's Adaptive Clock status
++ *
++ * @param device                               The identifier of the target device
++ * @param adaptiveClockStatus                  Reference in which to return the adaptive clocking status, either
++ *                                             \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED
++ *                                             or \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the current adaptive clocking status is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a adaptiveClockStatus is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus);
++
++/**
++ * Get the type of the GPU Bus (PCIe, PCI, ...)
++ *
++ * @param device                               The identifier of the target device
++ * @param type                                 The PCI Bus type
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the bus \a type is successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a type is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type);
++
++
++ /**
++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead.
++ *
++ * Get fabric information associated with the device.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager.
++ * Upon successful registration, the GPU is added to the NVLink fabric to enable
++ * peer-to-peer communication.
++ * This API reports the current state of the GPU in the NVLink fabric
++ * along with other useful information.
++ *
++ *
++ * @param device                               The identifier of the target device
++ * @param gpuFabricInfo                        Information about GPU fabric state
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support gpu fabric
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo);
++
++/**
++* Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned
++* \ref nvmlGpuFabricInfo_v2_t or later output structure.
++*
++* @note The caller must set the \ref nvmlGpuFabricInfoV_t.version field to the
++* appropriate version prior to calling this function. For example:
++* \code
++*     nvmlGpuFabricInfoV_t fabricInfo =
++*         { .version = nvmlGpuFabricInfo_v2 };
++*     nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device, &fabricInfo);
++* \endcode
++*
++* For Hopper &tm; or newer fully supported devices.
++*
++* @param device                               The identifier of the target device
++* @param gpuFabricInfo                        Information about GPU fabric state
++*
++* @return
++*         - \ref NVML_SUCCESS                 Upon success
++*         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support gpu fabric
++*/
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device,
++                                                 nvmlGpuFabricInfoV_t *gpuFabricInfo);
++
++/**
++ * Set new power limit of this device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
++ *
++ * See \ref nvmlPowerValue_v2_t for more information on the struct.
++ *
++ * \note Limit is not persistent across reboots or driver unloads.
++ * Enable persistent mode to prevent driver from unloading when no application is using the device.
++ *
++ * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version.
++ *
++ * @param device                               The identifier of the target device
++ * @param powerValue                           Power management limit in milliwatts to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the power limit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a powerValue is NULL or contains invalid values
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see NVML_FI_DEV_POWER_AVERAGE
++ * @see NVML_FI_DEV_POWER_INSTANT
++ * @see NVML_FI_DEV_POWER_MIN_LIMIT
++ * @see NVML_FI_DEV_POWER_MAX_LIMIT
++ * @see NVML_FI_DEV_POWER_CURRENT_LIMIT
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue);
++
++/**
++ * Get SRAM ECC error status of this device.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct.
++ *
++ * @param device                               The identifier of the target device
++ * @param status                               Returns SRAM ECC error status
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          if \a status has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED              if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           if \a device is invalid or \a status is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST                if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  if the version of \a nvmlEccSramErrorStatus_t is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device,
++                                                     nvmlEccSramErrorStatus_t *status);
++
++/**
++ * Get Conf Computing System capabilities.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param capabilities                         Reference in which to return the System CC capabilities
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a capabilities were successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a capabilities is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities);
++
++/**
++ * Get Conf Computing System State.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param state                                Reference in which to return the System CC State
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a state was successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a state is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state);
++
++/**
++ * Get Conf Computing Protected and Unprotected Memory Sizes.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               Device handle
++ * @param memInfo                              Reference in which to return the Protected/Unprotected Memory sizes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a memInfo was successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a memInfo or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo);
++
++/**
++ * Get Conf Computing GPUs ready state.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param isAcceptingWork                      Returns GPU current work accepting state,
++ *                                             NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or
++ *                                             NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the current GPUs ready state was successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a isAcceptingWork is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork);
++
++/**
++ * Get Conf Computing protected memory usage.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               The identifier of the target device
++ * @param memory                               Reference in which to return the memory information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a memory has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memory is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory);
++
++/**
++ * Get Conf Computing Gpu certificate details.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               The identifier of the target device
++ * @param gpuCert                              Reference in which to return the gpu certificate information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a gpuCert has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a gpuCert is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device,
++                                                            nvmlConfComputeGpuCertificate_t *gpuCert);
++
++/**
++ * Get Conf Computing Gpu attestation report.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               The identifier of the target device
++ * @param gpuAtstReport                        Reference in which to return the gpu attestation report
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a gpuAtstReport has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a gpuAtstReport is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device,
++                                                                  nvmlConfComputeGpuAttestationReport_t *gpuAtstReport);
++/**
++ * Get Conf Computing key rotation threshold detail.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param pKeyRotationThrInfo                  Reference in which to return the key rotation threshold data
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pKeyRotationThrInfo has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pKeyRotationThrInfo is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeKeyRotationThresholdInfo(
++                          nvmlConfComputeGetKeyRotationThresholdInfo_t *pKeyRotationThrInfo);
++
++/**
++ * Set Conf Computing Unprotected Memory Size.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               Device Handle
++ * @param sizeKiB                              Unprotected Memory size to be set in KiB
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a sizeKiB was successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB);
++
++/**
++ * Set Conf Computing GPUs ready state.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param isAcceptingWork                      GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or
++ *                                             NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the current GPUs ready state is successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a isAcceptingWork is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork);
++
++/**
++ * Set Conf Computing key rotation threshold.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * This function is to set the confidential compute key rotation threshold parameters.
++ * \a pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from
++ * NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX.
++ * Default value is 60.
++ *
++ * @param pKeyRotationThrInfo                  Reference to the key rotation threshold data
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the key rotation threshold max attacker advantage has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pKeyRotationThrInfo is invalid
++ *         - \ref NVML_ERROR_INVALID_STATE     if confidential compute GPU ready state is enabled
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeKeyRotationThresholdInfo(
++                          nvmlConfComputeSetKeyRotationThresholdInfo_t *pKeyRotationThrInfo);
++
++/**
++ * Get Conf Computing System Settings.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param settings                                     System CC settings
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                         if the query is successful
++ *         - \ref NVML_ERROR_UNINITIALIZED             if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT          if \a settings is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED             if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST               if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported
++ *         - \ref NVML_ERROR_UNKNOWN                   on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeSettings(nvmlSystemConfComputeSettings_t *settings);
++
++/**
++ * Retrieve GSP firmware version.
++ *
++ * The caller passes in buffer via \a version and corresponding GSP firmware numbered version
++ * is returned with the same parameter in string format.
++ *
++ * @param device                               Device handle
++ * @param version                              The retrieved GSP firmware version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if GSP firmware version is successfully retrieved
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or GSP \a version pointer is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if GSP firmware is not enabled for GPU
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version);
++
++/**
++ * Retrieve GSP firmware mode.
++ *
++ * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with
++ * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean.
++ *
++ * @param device                               Device handle
++ * @param isEnabled                            Pointer to specify if GSP firmware is enabled
++ * @param defaultMode                          Pointer to specify if GSP firmware is supported by default on \a device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if GSP firmware mode is successfully retrieved
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if GSP firmware is not enabled for GPU
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode);
++
++/**
++ * @}
++ */
++
++/** @addtogroup nvmlAccountingStats
++ *  @{
++ */
++
++/**
++ * Queries the state of per process accounting mode.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlDeviceGetAccountingStats for more details.
++ * See \ref nvmlDeviceSetAccountingMode
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 Reference in which to return the current accounting mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
++
++/**
++ * Queries process's accounting stats.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
++ * Accounting stats can be queried during life time of the process and after its termination.
++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and
++ * updated to actual running time after its termination.
++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
++ * processes.
++ *
++ * See \ref nvmlAccountingStats_t for description of each returned metric.
++ * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
++ *
++ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
++ *         queried since they don't contribute to GPU utilization.
++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
++ *
++ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
++ *
++ * @param device                               The identifier of the target device
++ * @param pid                                  Process Id of the target process to query stats for
++ * @param stats                                Reference in which to return the process's accounting stats
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a stats is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if process stats were not found
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if \a device doesn't support this feature or accounting mode is disabled
++ *                                              or on vGPU host.
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetAccountingBufferSize
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
++
++/**
++ * Queries list of processes that can be queried for accounting stats. The list of processes returned
++ * can be in running or terminated state.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * To just query the number of processes ready to be queried, call this function with *count = 0 and
++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
++ *
++ * For more details see \ref nvmlDeviceGetAccountingStats.
++ *
++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
++ *
++ * @param device                               The identifier of the target device
++ * @param count                                Reference in which to provide the \a pids array size, and
++ *                                               to return the number of elements ready to be queried
++ * @param pids                                 Reference in which to return list of process ids
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if \a device doesn't support this feature or accounting mode is disabled
++ *                                              or on vGPU host.
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
++ *                                                 expected value)
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetAccountingBufferSize
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
++
++/**
++ * Returns the number of processes that the circular buffer with accounting pids can hold.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * This is the maximum number of processes that accounting information will be stored for before information
++ * about oldest processes will get overwritten by information about new processes.
++ *
++ * @param device                               The identifier of the target device
++ * @param bufferSize                           Reference in which to provide the size (in number of elements)
++ *                                               of the circular buffer for accounting stats.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if buffer size was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a bufferSize is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetAccountingStats
++ * @see nvmlDeviceGetAccountingPids
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
++
++/** @} */
++
++/** @addtogroup nvmlDeviceQueries
++ *  @{
++ */
++
++/**
++ * Returns the list of retired pages by source, including pages that are pending retirement.
++ * The address information provided from this API is the hardware address of the page that was retired.  Note
++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param cause                             Filter page addresses by cause of retirement
++ * @param pageCount                         Reference in which to provide the \a addresses buffer size, and
++ *                                          to return the number of retired pages that match \a cause
++ *                                          Set to 0 to query the size without allocating an \a addresses buffer
++ * @param addresses                         Buffer to write the page addresses into
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
++ *                                             matching page addresses.  \a pageCount is set to the needed size.
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
++ *                                             \a addresses is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
++    unsigned int *pageCount, unsigned long long *addresses);
++
++/**
++ * Returns the list of retired pages by source, including pages that are pending retirement.
++ * The address information provided from this API is the hardware address of the page that was retired.  Note
++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63.
++ *
++ * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's
++ *       retirement.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param cause                             Filter page addresses by cause of retirement
++ * @param pageCount                         Reference in which to provide the \a addresses buffer size, and
++ *                                          to return the number of retired pages that match \a cause
++ *                                          Set to 0 to query the size without allocating an \a addresses buffer
++ * @param addresses                         Buffer to write the page addresses into
++ * @param timestamps                        Buffer to write the timestamps of page retirement, additional for _v2
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
++ *                                             matching page addresses.  \a pageCount is set to the needed size.
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
++ *                                             \a addresses is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
++    unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps);
++
++/**
++ * Check if any pages are pending retirement and need a reboot to fully retire.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                            The identifier of the target device
++ * @param isPending                         Reference in which to return the pending status
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isPending was populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isPending is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
++
++/**
++ * Get number of remapped rows. The number of rows reported will be based on
++ * the cause of the remapping. isPending indicates whether or not there are
++ * pending remappings. A reset will be required to actually remap the row.
++ * failureOccurred will be set if a row remapping ever failed in the past. A
++ * pending remapping won't affect future work on the GPU since
++ * error-containment and dynamic page blacklisting will take care of that.
++ *
++ * @note On MIG-enabled GPUs with active instances, querying the number of
++ * remapped rows is not supported
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param corrRows                             Reference for number of rows remapped due to correctable errors
++ * @param uncRows                              Reference for number of rows remapped due to uncorrectable errors
++ * @param isPending                            Reference for whether or not remappings are pending
++ * @param failureOccurred                      Reference that is set when a remapping has failed in the past
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If MIG is enabled or if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           Unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows,
++                                               unsigned int *isPending, unsigned int *failureOccurred);
++
++/**
++ * Get the row remapper histogram. Returns the remap availability for each bank
++ * on the GPU.
++ *
++ * @param device                               Device handle
++ * @param values                               Histogram values
++ *
++ * @return
++ *        - \ref NVML_SUCCESS                  On success
++ *        - \ref NVML_ERROR_UNKNOWN            On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values);
++
++/**
++ * Get architecture for device
++ *
++ * @param device                               The identifier of the target device
++ * @param arch                                 Reference where architecture is returned, if call successful.
++ *                                             Set to NVML_DEVICE_ARCH_* upon success
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device or \a arch (output reference) are invalid
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch);
++
++/**
++ * Retrieves the frequency monitor fault status for the device.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Requires root user.
++ *
++ * See \ref nvmlClkMonStatus_t for details on decoding the status output.
++ *
++ * @param device                               The identifier of the target device
++ * @param status                               Reference in which to return the clkmon fault status
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a status has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a status is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetClkMonStatus()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status);
++
++/**
++ * Retrieves the current utilization and process ID
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running.
++ * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at
++ * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization
++ * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values
++ * are returned as "unsigned int" values. If no valid sample entries are found since the lastSeenTimeStamp, NVML_ERROR_NOT_FOUND
++ * is returned.
++ *
++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
++ * \a utilization set to NULL. The caller should allocate a buffer of size
++ * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed
++ * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for.
++ *
++ * On successful return, the function updates \a processSamplesCount with the number of process utilization sample
++ * structures that were actually written. This may differ from a previously read value as instances are created or
++ * destroyed.
++ *
++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported.
++ *
++ * @param device                    The identifier of the target device
++ * @param utilization               Pointer to caller-supplied buffer in which guest process utilization samples are returned
++ * @param processSamplesCount       Pointer to caller-supplied array size, and returns number of processes running
++ * @param lastSeenTimeStamp         Return only samples with timestamp greater than lastSeenTimeStamp.
++
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a processSamplesCount is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
++                                              unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
++
++/**
++ * Retrieves the recent utilization and process ID for all running processes
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder, jpeg decoder, OFA (Optical Flow Accelerator)
++ * for all running processes. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at
++ * by \a procesesUtilInfo->procUtilArray. One utilization sample structure is returned per process running, that had some non-zero utilization
++ * during the last sample period. It includes the CPU timestamp at which  the samples were recorded. Individual utilization values
++ * are returned as "unsigned int" values.
++ *
++ * The caller should allocate a buffer of size processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t). If the buffer is too small, the API will
++ * return \a NVML_ERROR_INSUFFICIENT_SIZE, with the recommended minimal buffer size at \a procesesUtilInfo->processSamplesCount. The caller should
++ * invoke the function again with the allocated buffer passed in \a procesesUtilInfo->procUtilArray, and \a procesesUtilInfo->processSamplesCount
++ * set to a number no less than the value recommended by the previous API return.
++ *
++ * On successful return, the function updates \a procesesUtilInfo->processSamplesCount with the number of process utilization info structures
++ * that were actually written. This may differ from a previously read value as instances are created or destroyed.
++ *
++ * \a procesesUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a procesesUtilInfo->lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * \a procesesUtilInfo->version is the version number of the structure nvmlProcessesUtilizationInfo_t, the caller should set the correct version
++ * number to retrieve the specific version of processes utilization information.
++ *
++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported.
++ *
++ * @param device                    The identifier of the target device
++ * @param procesesUtilInfo          Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          if \a procesesUtilInfo->procUtilArray has been populated
++ *         - \ref NVML_ERROR_UNINITIALIZED              if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           if \a device is invalid, or \a procesesUtilInfo is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              if the device does not support this feature
++ *         - \ref NVML_ERROR_NOT_FOUND                  if sample entries are not found
++ *         - \ref NVML_ERROR_GPU_IS_LOST                if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  if the version of \a procesesUtilInfo is invalid
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE          if \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small.
++ *                                                      The caller should check the minimum array size from the returned procesesUtilInfo->processSamplesCount, and call
++ *                                                      the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t)
++ *         - \ref NVML_ERROR_UNKNOWN                    on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, nvmlProcessesUtilizationInfo_t *procesesUtilInfo);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlUnitCommands Unit Commands
++ *  This chapter describes NVML operations that change the state of the unit. For S-class products.
++ *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
++ *  error code when invoking any of these methods.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Set the LED state for the unit. The LED can be either green (0) or amber (1).
++ *
++ * For S-class products.
++ * Requires root/admin permissions.
++ *
++ * This operation takes effect immediately.
++ *
++ *
++ * <b>Current S-Class products don't provide unique LEDs for each unit. As such, both front
++ * and back LEDs will be toggled in unison regardless of which unit is specified with this command.</b>
++ *
++ * See \ref nvmlLedColor_t for available colors.
++ *
++ * @param unit                                 The identifier of the target unit
++ * @param color                                The target LED color
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the LED color has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a color is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlUnitGetLedState()
++ */
++nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlDeviceCommands Device Commands
++ *  This chapter describes NVML operations that change the state of the device.
++ *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
++ *  error code when invoking any of these methods.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Set the persistence mode for the device.
++ *
++ * For all products.
++ * For Linux only.
++ * Requires root/admin permissions.
++ *
++ * The persistence mode determines whether the GPU driver software is torn down after the last client
++ * exits.
++ *
++ * This operation takes effect immediately. It is not persistent across reboots. After each reboot the
++ * persistence mode is reset to "Disabled".
++ *
++ * See \ref nvmlEnableState_t for available modes.
++ *
++ * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA
++ * memory, the given device handle will no longer be valid, and to continue to interact with this
++ * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This
++ * limitation is currently only applicable to devices that have a coherent NVLink connection to
++ * system memory.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 The target persistence mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the persistence mode was set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetPersistenceMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode);
++
++/**
++ * Set the compute mode for the device.
++ *
++ * For all products.
++ * Requires root/admin permissions.
++ *
++ * The compute mode determines whether a GPU can be used for compute operations and whether it can
++ * be shared across contexts.
++ *
++ * This operation takes effect immediately. Under Linux it is not persistent across reboots and
++ * always resets to "Default". Under Windows it is persistent.
++ *
++ * Under Windows, compute mode may only be set to DEFAULT when running in WDDM.
++ *
++ * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported.
++ *
++ * See \ref nvmlComputeMode_t for details on available compute modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 The target compute mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the compute mode was set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetComputeMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode);
++
++/**
++ * Set the ECC mode for the device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
++ * Requires root/admin permissions.
++ *
++ * The ECC mode determines whether the GPU enables its ECC support.
++ *
++ * This operation takes effect after the next reboot.
++ *
++ * See \ref nvmlEnableState_t for details on available modes.
++ *
++ * @param device                               The identifier of the target device
++ * @param ecc                                  The target ECC mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the ECC mode was set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a ecc is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetEccMode()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc);
++
++/**
++ * Clear the ECC error and other memory error counts for the device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Only applicable to devices with ECC.
++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts.
++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts.
++ * Requires root/admin permissions.
++ * Requires ECC Mode to be enabled.
++ *
++ * Sets all of the specified ECC counters to 0, including both detailed and total counts.
++ *
++ * This operation takes effect immediately.
++ *
++ * See \ref nvmlEccCounterType_t for details on available counter types.
++ *
++ * @param device                               The identifier of the target device
++ * @param counterType                          Flag that indicates which type of errors should be cleared.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the error counts were cleared
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a counterType is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see
++ *      - nvmlDeviceGetDetailedEccErrors()
++ *      - nvmlDeviceGetTotalEccErrors()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
++
++/**
++ * Set the driver model for the device.
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * For windows only.
++ * Requires root/admin permissions.
++ *
++ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
++ * to the device it must run in WDDM mode.
++ *
++ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
++ * This should only be done if the host is subsequently powered down and the display is detached from the device
++ * before the next reboot.
++ *
++ * This operation takes effect after the next reboot.
++ *
++ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
++ *
++ * Changing the driver model to WDDM is not supported when the GPU doesn't support graphics acceleration or
++ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
++ *
++ * See \ref nvmlDriverModel_t for details on available driver models.
++ * See \ref nvmlFlagDefault and \ref nvmlFlagForce
++ *
++ * @param device                               The identifier of the target device
++ * @param driverModel                          The target driver model
++ * @param flags                                Flags that change the default behavior
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the driver model has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a driverModel is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows or the device does not support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetDriverModel()
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
++
++typedef enum nvmlClockLimitId_enum { /* Symbolic clock limits accepted by nvmlDeviceSetGpuLockedClocks() in place of MHz values */
++    NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, /**< First value of the symbolic ID range; the IDs below follow it consecutively */
++    NVML_CLOCK_LIMIT_ID_TDP,                      /**< "tdp" in the nvmlDeviceSetGpuLockedClocks() combination table */
++    NVML_CLOCK_LIMIT_ID_UNLIMITED                 /**< "unlimited" in the nvmlDeviceSetGpuLockedClocks() combination table */
++} nvmlClockLimitId_t;
++
++/**
++ * Set clocks that device will lock to.
++ *
++ * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz.
++ * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running.
++ * See \ref nvmlDeviceSetApplicationsClocks
++ *
++ * Can be used as a setting to request constant performance.
++ *
++ * This can be called with a pair of integer clock frequencies in MHz, or a pair of \ref nvmlClockLimitId_t values.
++ * See the table below for valid combinations of these values.
++ *
++ * minGpuClock | maxGpuClock | Effect
++ * ------------+-------------+--------------------------------------------------
++ *     tdp     |     tdp     | Lock clock to TDP
++ *  unlimited  |     tdp     | Upper bound is TDP but clock may drift below this
++ *     tdp     |  unlimited  | Lower bound is TDP but clock may boost above this
++ *  unlimited  |  unlimited  | Unlocked (== nvmlDeviceResetGpuLockedClocks)
++ *
++ * If one arg takes one of these values, the other must be one of these values as
++ * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT.
++ *
++ * Requires root/admin permissions.
++ *
++ * After system reboot or driver reload applications clocks go back to their default value.
++ * See \ref nvmlDeviceResetGpuLockedClocks.
++ *
++ * For Volta &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param minGpuClockMHz                       Requested minimum gpu clock in MHz
++ * @param maxGpuClockMHz                       Requested maximum gpu clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz
++ *                                                 is not a valid clock combination
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz);
++
++/**
++ * Resets the gpu clock to the default value
++ *
++ * This is the gpu clock that will be used after system reboot or driver reload.
++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * @see nvmlDeviceSetGpuLockedClocks
++ *
++ * For Volta &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device);
++
++/**
++ * Set memory clocks that device will lock to.
++ *
++ * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz.
++ * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running.
++ * See \ref nvmlDeviceSetApplicationsClocks
++ *
++ * Can be used as a setting to request constant performance.
++ *
++ * Requires root/admin permissions.
++ *
++ * After system reboot or driver reload applications clocks go back to their default value.
++ * See \ref nvmlDeviceResetMemoryLockedClocks.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param minMemClockMHz                       Requested minimum memory clock in MHz
++ * @param maxMemClockMHz                       Requested maximum memory clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minMemClockMHz and \a maxMemClockMHz
++ *                                                 is not a valid clock combination
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz);
++
++/**
++ * Resets the memory clock to the default value
++ *
++ * This is the memory clock that will be used after system reboot or driver reload.
++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * @see nvmlDeviceSetMemoryLockedClocks
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device);
++
++/**
++ * Set clocks that applications will lock to.
++ *
++ * Sets the clocks that compute and graphics applications will be running at.
++ * e.g. CUDA driver requests these clocks during context creation which means this property
++ * defines clocks at which CUDA applications will be running unless some overspec event
++ * occurs (e.g. over power, over thermal or external HW brake).
++ *
++ * Can be used as a setting to request constant performance.
++ *
++ * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks.
++ *
++ * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call
++ * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting
++ * above the clock value being set.
++ *
++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
++ * Requires root/admin permissions.
++ *
++ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks
++ * for details on how to list available clocks combinations.
++ *
++ * After system reboot or driver reload applications clocks go back to their default value.
++ * See \ref nvmlDeviceResetApplicationsClocks.
++ *
++ * @param device                               The identifier of the target device
++ * @param memClockMHz                          Requested memory clock in MHz
++ * @param graphicsClockMHz                     Requested graphics clock in MHz
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memClockMHz and \a graphicsClockMHz
++ *                                                 is not a valid clock combination
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
++
++/**
++ * Resets the application clock to the default value
++ *
++ * This is the applications clock that will be used after system reboot or driver reload.
++ * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks.
++ *
++ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks,
++ * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above
++ * base clocks as thermal limits allow.
++ *
++ * @see nvmlDeviceGetApplicationsClock
++ * @see nvmlDeviceSetApplicationsClocks
++ *
++ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if new settings were successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
++
++/**
++ * Try to set the current state of Auto Boosted clocks on a device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
++ * rates are desired.
++ *
++ * Non-root users may use this API by default but can be restricted by root from using this API by calling
++ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
++ * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled.
++ *
++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
++ * behavior.
++ *
++ * @param device                               The identifier of the target device
++ * @param enabled                              What state to try to set Auto Boosted clocks of the target device to
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 If the Auto Boosted clocks were successfully set to the state specified by \a enabled
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
++
++/**
++ * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will
++ * return to when no compute processes (e.g. CUDA applications which have an active context) are running.
++ *
++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
++ * Requires root/admin permissions.
++ *
++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
++ * rates are desired.
++ *
++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
++ * behavior.
++ *
++ * @param device                               The identifier of the target device
++ * @param enabled                              What state to try to set default Auto Boosted clocks of the target device to
++ * @param flags                                Flags that change the default behavior. Currently Unused.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NO_PERMISSION     If the calling user does not have permission to change Auto Boosted clock's default state.
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
++
++/**
++ * Sets the speed of the fan control policy to default.
++ *
++ * For all cuda-capable discrete products with fans
++ *
++ * @param device                        The identifier of the target device
++ * @param fan                           The index of the fan, starting at zero
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if speed has been adjusted
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this
++ *                                             (doesn't have fans)
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan);
++
++/**
++ * Sets current fan control policy.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Requires privileged user.
++ *
++ * For all cuda-capable discrete products with fans
++ *
++ * @param device                        The identifier of the target \a device
++ * @param fan                           The index of the fan, starting at zero
++ * @param policy                        The fan control \a policy to set
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a policy has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a policy is null or the \a fan given doesn't reference
++ *                                             a fan that exists.
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the \a device is older than Maxwell
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan,
++                                                   nvmlFanControlPolicy_t policy);
++
++/**
++ * Sets the temperature threshold for the GPU with the specified threshold type in degrees C.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
++ *
++ * @param device                               The identifier of the target device
++ * @param thresholdType                        The type of threshold value to be set
++ * @param temp                                 Reference which holds the value to be set
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a temp has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a temperature sensor or is unsupported
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp);
++
++/**
++ * Set new power limit of this device.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
++ *
++ * \note Limit is not persistent across reboots or driver unloads.
++ * Enable persistence mode to prevent the driver from unloading when no application is using the device.
++ *
++ * @param device                               The identifier of the target device
++ * @param limit                                Power management limit in milliwatts to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a limit has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is out of range
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceGetPowerManagementLimitConstraints
++ * @see nvmlDeviceGetPowerManagementDefaultLimit
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
++
++/**
++ * Sets new GOM. See \a nvmlGpuOperationMode_t for details.
++ *
++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
++ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
++ * Requires root/admin permissions.
++ *
++ * Changing GOMs requires a reboot.
++ * The reboot requirement might be removed in the future.
++ *
++ * Compute-only GOMs don't support graphics acceleration. Under Windows, switching to these GOMs when the
++ * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 Target GOM
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a mode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is incorrect
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support GOM or specific mode
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlGpuOperationMode_t
++ * @see nvmlDeviceGetGpuOperationMode
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
++
++/**
++ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
++ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
++ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
++ * to query the current restriction settings.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * @param device                               The identifier of the target device
++ * @param apiType                              Target API type for this operation
++ * @param isRestricted                         The target restriction
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a apiType is incorrect
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support changing API restrictions or the device does not support
++ *                                                 the feature that api restrictions are being set for (E.G. Enabling/disabling auto
++ *                                                 boosted clocks is not supported by the device)
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlRestrictedAPI_t
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
++
++/**
++ * Sets the speed of a specified fan.
++ *
++ * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor
++ *          the temperature and adjust the fan speed accordingly.
++ *          If you set the fan speed too low you can burn your GPU!
++ *          Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy.
++ *
++ * For all cuda-capable discrete products with fans that are Maxwell or Newer.
++ *
++ * device                                The identifier of the target device
++ * fan                                   The index of the fan, starting at zero
++ * speed                                 The target speed of the fan [0-100] in % of max speed
++ *
++ * return
++ *        NVML_SUCCESS                   if the fan speed has been set
++ *        NVML_ERROR_UNINITIALIZED       if the library has not been successfully initialized
++ *        NVML_ERROR_INVALID_ARGUMENT    if the device is not valid, or the speed is outside acceptable ranges,
++ *                                              or if the fan index doesn't reference an actual fan.
++ *        NVML_ERROR_NOT_SUPPORTED       if the device is older than Maxwell.
++ *        NVML_ERROR_UNKNOWN             if there was an unexpected error.
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed);
++
++/**
++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works
++ *             on Maxwell onwards GPU architectures.
++ *
++ * Set the GPCCLK VF offset value
++ * @param[in]   device                         The identifier of the target device
++ * @param[in]   offset                         The GPCCLK VF offset value to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a offset has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a offset is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset);
++
++/**
++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works
++ *             on Maxwell onwards GPU architectures.
++ *
++ * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges.
++ * @param[in]   device                         The identifier of the target device
++ * @param[in]   offset                         The MemClk VF offset value to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a offset has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a offset is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset);
++
++/**
++ * @}
++ */
++
++/** @addtogroup nvmlAccountingStats
++ *  @{
++ */
++
++/**
++ * Enables or disables per process accounting.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * @note This setting is not persistent and will default to disabled after driver unloads.
++ *       Enable persistence mode to be sure the setting doesn't switch off to disabled.
++ *
++ * @note Enabling accounting mode has no negative impact on the GPU performance.
++ *
++ * @note Disabling accounting clears all accounting pids information.
++ *
++ * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported.
++ *
++ * See \ref nvmlDeviceGetAccountingMode
++ * See \ref nvmlDeviceGetAccountingStats
++ * See \ref nvmlDeviceClearAccountingPids
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 The target accounting mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the new mode has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a mode are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
++
++/**
++ * Clears accounting information about all processes that have already terminated.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * See \ref nvmlDeviceGetAccountingMode
++ * See \ref nvmlDeviceGetAccountingStats
++ * See \ref nvmlDeviceSetAccountingMode
++ *
++ * @param device                               The identifier of the target device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if accounting information has been cleared
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup NvLink NvLink Methods
++ * This chapter describes methods that NVML can perform on NVLINK enabled devices.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Retrieves the state of the device's NvLink for the link specified
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param isActive                             \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
++ *                                             the link is active and NVML_FEATURE_DISABLED indicates it
++ *                                             is inactive
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a isActive has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a isActive is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
++
++/**
++ * Retrieves the version of the device's NvLink for the link specified
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param version                              Requested NvLink version
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a version is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
++
++/**
++ * Retrieves the requested capability from the device's NvLink for the link specified
++ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried
++ * The return value should be treated as a boolean.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param capability                           Specifies the \a nvmlNvLinkCapability_t to be queried
++ * @param capResult                            A boolean for the queried capability indicating that feature is available
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a capResult has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a capability is invalid or \a capResult is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
++                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
++
++/**
++ * Retrieves the PCI information for the remote node on a NvLink link
++ * Note: pciSubSystemId is not filled in this function and is indeterminate
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param pci                                  \a nvmlPciInfo_t of the remote node for the specified link
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a pci has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a pci is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
++
++/**
++ * Retrieves the specified error counter value
++ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param counter                              Specifies the NvLink counter to be queried
++ * @param counterValue                         Returned counter value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a counterValue has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
++                                                     nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
++
++/**
++ * Resets all error counters to zero
++ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link whose error counters are to be reset
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the reset is successful
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
++
++/**
++ * Deprecated: Setting utilization counter control is no longer supported.
++ *
++ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition.  Performs a reset
++ * of the counters if the reset parameter is non-zero.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param counter                              Specifies the counter that should be set (0 or 1).
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to set
++ * @param reset                                Resets the counters on set if non-zero
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the control has been set successfully
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
++                                                           nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
++
++/**
++ * Deprecated: Getting utilization counter control is no longer supported.
++ *
++ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param counter                              Specifies the counter that should be queried (0 or 1).
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to place information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the control has been set successfully
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
++                                                           nvmlNvLinkUtilizationControl_t *control);
++
++
++/**
++ * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead.
++ *
++ * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
++ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
++ *  before reading the utilization counters as they have no default state
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param counter                              Specifies the counter that should be read (0 or 1).
++ * @param rxcounter                            Receive counter return value
++ * @param txcounter                            Transmit counter return value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a rxcounter and \a txcounter have been successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter,
++                                                           unsigned long long *rxcounter, unsigned long long *txcounter);
++
++/**
++ * Deprecated: Freezing NVLINK utilization counters is no longer supported.
++ *
++ * Freeze the NVLINK utilization counters
++ * Both the receive and transmit counters are operated on by this function
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be queried
++ * @param counter                              Specifies the counter that should be frozen (0 or 1).
++ * @param freeze                               NVML_FEATURE_ENABLED = freeze the receive and transmit counters
++ *                                             NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if counters were successfully frozen or unfrozen
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, \a counter, or \a freeze is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link,
++                                            unsigned int counter, nvmlEnableState_t freeze);
++
++/**
++ * Deprecated: Resetting NVLINK utilization counters is no longer supported.
++ *
++ * Reset the NVLINK utilization counters
++ * Both the receive and transmit counters are operated on by this function
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param link                                 Specifies the NvLink link to be reset
++ * @param counter                              Specifies the counter that should be reset (0 or 1)
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if counters were successfully reset
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
++
++/**
++* Get the NVLink device type of the remote device connected over the given link.
++*
++* @param device                                The device handle of the target GPU
++* @param link                                  The NVLink link index on the target GPU
++* @param pNvLinkDeviceType                     Pointer in which the output remote device type is returned
++*
++* @return
++*         - \ref NVML_SUCCESS                  if \a pNvLinkDeviceType has been set
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_NOT_SUPPORTED      if NVLink is not supported
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device or \a link is invalid, or
++*                                              \a pNvLinkDeviceType is NULL
++*         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is
++*                                              otherwise inaccessible
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType);
++
++/**
++ * Set NvLink Low Power Threshold for device.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param device                               The identifier of the target device
++ * @param info                                 Reference to \a nvmlNvLinkPowerThres_t struct
++ *                                             input parameters
++ *
++ * @return
++ *        - \ref NVML_SUCCESS                 if the threshold is successfully set
++ *        - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *        - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or the threshold in \a info is not within range
++ *        - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *
++ **/
++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info);
++
++/**
++ * Set the global nvlink bandwidth mode
++ *
++ * @param nvlinkBwMode             nvlink bandwidth mode
++ * @return
++ *         - \ref NVML_SUCCESS                on success
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided
++ *         - \ref NVML_ERROR_IN_USE           if P2P object exists
++ *         - \ref NVML_ERROR_NOT_SUPPORTED    if GPU is not Hopper or newer architecture.
++ *         - \ref NVML_ERROR_NO_PERMISSION    if not root user
++ */
++nvmlReturn_t DECLDIR nvmlSystemSetNvlinkBwMode(unsigned int nvlinkBwMode);
++
++/**
++ * Get the global nvlink bandwidth mode
++ *
++ * @param nvlinkBwMode             reference of nvlink bandwidth mode
++ * @return
++ *         - \ref NVML_SUCCESS                on success
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
++ *         - \ref NVML_ERROR_NOT_SUPPORTED    if GPU is not Hopper or newer architecture.
++ *         - \ref NVML_ERROR_NO_PERMISSION    if not root user
++ */
++nvmlReturn_t DECLDIR nvmlSystemGetNvlinkBwMode(unsigned int *nvlinkBwMode);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlEvents Event Handling Methods
++ * This chapter describes methods that NVML can perform against each device to register and wait for
++ * some event to occur.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Create an empty set of events.
++ * Event set should be freed by \ref nvmlEventSetFree
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * @param set                                  Reference in which to return the event handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the event has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a set is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlEventSetFree
++ */
++nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
++
++/**
++ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
++ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
++ *
++ * For Linux only.
++ *
++ * \b IMPORTANT: Operations on \a set are not thread safe
++ *
++ * This call starts recording of events on specific device.
++ * All events that occurred before this call are not recorded.
++ * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2
++ *
++ * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed.
++ * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes
++ *     are registered in that case.
++ *
++ * @param device                               The identifier of the target device
++ * @param eventTypes                           Bitmask of \ref nvmlEventType to record
++ * @param set                                  Set to which to add new event types
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the event has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is invalid or \a set is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform does not support this feature or some of requested event types
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlEventType
++ * @see nvmlDeviceGetSupportedEventTypes
++ * @see nvmlEventSetWait
++ * @see nvmlEventSetFree
++ */
++nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
++
++/**
++ * Returns information about events supported on device
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows.
++ *
++ * @param device                               The identifier of the target device
++ * @param eventTypes                           Reference in which to return bitmask of supported events
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the eventTypes has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlEventType
++ * @see nvmlDeviceRegisterEvents
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
++
++/**
++ * Waits on events and delivers events
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * If some events are ready to be delivered at the time of the call, function returns immediately.
++ * If there are no events ready to be delivered, function sleeps till event arrives
++ * but not longer than specified timeout. This function in certain conditions can return before
++ * specified timeout passes (e.g. when interrupt arrives)
++ *
++ * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system.
++ * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error
++ * type is returned for all xid error events.
++ *
++ * On Linux, every xid error event would return the associated event data and other information if applicable.
++ *
++ * In MIG mode, if device handle is provided, the API reports all the events for the available instances,
++ * only if the caller has appropriate privileges. In absence of required privileges, only the events which
++ * affect all the instances (i.e. whole device) are reported.
++ *
++ * This API does not currently support per-instance event reporting using MIG device handles.
++ *
++ * @param set                                  Reference to set of events to wait on
++ * @param data                                 Reference in which to return event data
++ * @param timeoutms                            Maximum amount of wait time in milliseconds for registered event
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the data has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a data is NULL
++ *         - \ref NVML_ERROR_TIMEOUT           if no event arrived in specified timeout or interrupt arrived
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if a GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlEventType
++ * @see nvmlDeviceRegisterEvents
++ */
++nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
++
++/**
++ * Releases events in the set
++ *
++ * For Fermi &tm; or newer fully supported devices.
++ *
++ * @param set                                  Reference to events to be released
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the event has been successfully released
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlDeviceRegisterEvents
++ */
++nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlZPI Drain states
++ * This chapter describes methods that NVML can perform against each device to control their drain state
++ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
++ * power on/off GPUs, enable robust reset scenarios, etc.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Modify the drain state of a GPU.  This method forces a GPU to no longer accept new incoming requests.
++ * Any new NVML process will no longer see this GPU.  Persistence mode for this GPU must be turned off before
++ * this call is made.
++ * Must be called as administrator.
++ * For Linux only.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ * Some Kepler devices supported.
++ *
++ * @param pciInfo                              The PCI address of the GPU drain state to be modified
++ * @param newState                             The drain state that should be entered, see \ref nvmlEnableState_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the drain state was successfully modified
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo or \a newState is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
++ *         - \ref NVML_ERROR_IN_USE            if the device has persistence mode turned on
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
++
++/**
++ * Query the drain state of a GPU.  This method is used to check if a GPU is currently in a draining
++ * state.
++ * For Linux only.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ * Some Kepler devices supported.
++ *
++ * @param pciInfo                              The PCI address of the GPU drain state to be queried
++ * @param currentState                         The current drain state for this GPU, see \ref nvmlEnableState_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the drain state was successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo or \a currentState is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
++
++/**
++ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
++ * as long as no other processes are attached. If other processes are attached, this call will return
++ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the
++ * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
++ * to initiate the draining state is if that process was using, and is still using, a GPU before the
++ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
++ * prior to this call.
++ *
++ * For long-running NVML processes please note that this will change the enumeration of current GPUs.
++ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
++ * Also, device handles after the removed GPU will not be valid and must be re-established.
++ * Must be run as administrator.
++ * For Linux only.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ * Some Kepler devices supported.
++ *
++ * @param pciInfo                              The PCI address of the GPU to be removed
++ * @param gpuState                             Whether the GPU is to be removed from the OS,
++ *                                             see \ref nvmlDetachGpuState_t
++ * @param linkState                            Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the GPU was successfully removed
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
++ *         - \ref NVML_ERROR_IN_USE            if the device is still in use and cannot be removed
++ */
++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
++
++/**
++ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
++ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.
++ * If all are zeroes then the entire PCI tree will be searched.  Please note that for long-running NVML processes
++ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
++ *
++ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
++ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
++ *
++ * Must be run as administrator.
++ * For Linux only.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ * Some Kepler devices supported.
++ *
++ * @param pciInfo                              The PCI tree to be searched.  Only the domain, bus, and device
++ *                                             fields are used in this call.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the rediscovery request was successful
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the operating system does not support this feature
++ *         - \ref NVML_ERROR_OPERATING_SYSTEM  if the operating system is denying this feature
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlFieldValueQueries Field Value Queries
++ *  This chapter describes NVML operations that are associated with retrieving Field Values from NVML
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Request values for a list of fields for a device. This API allows multiple fields to be queried at once.
++ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs
++ * will be populated from a single call rather than making a driver call for each fieldId.
++ *
++ * @param device                               The device handle of the GPU to request field values for
++ * @param valuesCount                          Number of entries in values that should be retrieved
++ * @param values                               Array of \a valuesCount structures to hold field values.
++ *                                             Each value's fieldId must be populated prior to this call
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if any values in \a values were populated. Note that you must
++ *                                             check the nvmlReturn field of each value for each individual
++ *                                             status
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a values is NULL
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
++
++/**
++ * Clear values for a list of fields for a device. This API allows multiple fields to be cleared at once.
++ *
++ * @param device                               The device handle of the GPU to clear field values for
++ * @param valuesCount                          Number of entries in values that should be cleared
++ * @param values                               Array of \a valuesCount structures to hold field values.
++ *                                             Each value's fieldId must be populated prior to this call
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if any values in \a values were cleared. Note that you must
++ *                                             check the nvmlReturn field of each value for each individual
++ *                                             status
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a values is NULL
++ */
++nvmlReturn_t DECLDIR nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlVirtualGpuQueries vGPU APIs
++ * This chapter describes operations that are associated with NVIDIA vGPU Software products.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * This method is used to get the virtualization mode corresponding to the GPU.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                    Identifier of the target device
++ * @param pVirtualMode              Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a pVirtualMode is fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a pVirtualMode is NULL
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode);
++
++/**
++ * Queries if SR-IOV host operation is supported on a vGPU supported device.
++ *
++ * Checks whether SR-IOV host capability is supported by the device and the
++ * driver, and indicates device is in SR-IOV mode if both of these conditions
++ * are true.
++ *
++ * @param device                                The identifier of the target device
++ * @param pHostVgpuMode                         Reference in which to return the current vGPU mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if device's vGPU mode has been successfully retrieved
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device handle is 0 or \a pHostVgpuMode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if \a device doesn't support this feature.
++ *         - \ref NVML_ERROR_UNKNOWN            if any unexpected error occurred
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode);
++
++/**
++ * This method is used to set the virtualization mode corresponding to the GPU.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                    Identifier of the target device
++ * @param virtualMode               virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a virtualMode is set
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device or \a virtualMode is invalid
++ *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if setting of virtualization mode is not supported.
++ *         - \ref NVML_ERROR_NO_PERMISSION      if setting of virtualization mode is not allowed for this client.
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode);
++
++/**
++ * Get the vGPU heterogeneous mode for the device.
++ *
++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes.
++ *
++ * On successful return, the function returns \a pHeterogeneousMode->mode with the current vGPU heterogeneous mode.
++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should
++ * set the correct version number to retrieve the vGPU heterogeneous mode.
++ * \a pHeterogeneousMode->mode can either be \ref NVML_FEATURE_ENABLED or \ref NVML_FEATURE_DISABLED.
++ *
++ * @param device                               The identifier of the target device
++ * @param pHeterogeneousMode                   Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED              If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           If \a device is invalid or \a pHeterogeneousMode is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              If \a device doesn't support this feature
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  If the version of \a pHeterogeneousMode is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice_t device, nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode);
++
++/**
++ * Enable or disable vGPU heterogeneous mode for the device.
++ *
++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes.
++ *
++ * API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode
++ * set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of this API
++ * is expected to shutdown the vGPU VMs and retry setting the \a mode.
++ * On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode.
++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should
++ * set the correct version number to set the vGPU heterogeneous mode.
++ *
++ * @param device                               Identifier of the target device
++ * @param pHeterogeneousMode                   Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED              If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid
++ *         - \ref NVML_ERROR_IN_USE                     If the \a device is in use
++ *         - \ref NVML_ERROR_NO_PERMISSION              If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              If MIG is enabled or \a device doesn't support this feature
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  If the version of \a pHeterogeneousMode is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice_t device, const nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode);
++
++/**
++ * Query the placement ID of active vGPU instance.
++ *
++ * When in vGPU heterogeneous mode, this function returns a valid placement ID as \a pPlacement->placementId
++ * else NVML_INVALID_VGPU_PLACEMENT_ID is returned.
++ * \a pPlacement->version is the version number of the structure nvmlVgpuPlacementId_t, the caller should
++ * set the correct version number to get placement id of the vGPU instance \a vgpuInstance.
++ *
++ * @param vgpuInstance                         Identifier of the target vGPU instance
++ * @param pPlacement                           Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          If information is successfully retrieved
++ *         - \ref NVML_ERROR_NOT_FOUND                  If \a vgpuInstance does not match a valid active vGPU instance
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           If \a vgpuInstance is invalid or \a pPlacement is NULL
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  If the version of \a pPlacement is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuPlacementId_t *pPlacement);
++
++/**
++ * Query the supported vGPU placement ID of the vGPU type.
++ *
++ * An array of supported vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the
++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be
++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances().
++ *
++ * This function will return supported placement IDs even if GPU is not in vGPU heterogeneous mode.
++ *
++ * @param device                               Identifier of the target device
++ * @param vgpuTypeId                           Handle to vGPU type. The vGPU type ID
++ * @param pPlacementList                       Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED              If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              If \a device or \a vgpuTypeId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION              If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  If the version of \a pPlacementList is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList);
++
++/**
++ * Query the creatable vGPU placement ID of the vGPU type.
++ *
++ * An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the
++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be
++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances().
++ * The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the
++ * vGPU instance is running.
++ *
++ * The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode.
++ *
++ * @param device                               The identifier of the target device
++ * @param vgpuTypeId                           Handle to vGPU type. The vGPU type ID
++ * @param pPlacementList                       Pointer to the list of vGPU placement structure \a nvmlVgpuPlacementList_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED              If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              If \a device or \a vgpuTypeId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION              If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  If the version of \a pPlacementList is invalid
++ *         - \ref NVML_ERROR_UNKNOWN                    On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList);
++
++/**
++ * Retrieve the static GSP heap size of the vGPU type in bytes
++ *
++ * @param vgpuTypeId                           Handle to vGPU type
++ * @param gspHeapSize                          Reference to return the GSP heap size value
++ * @return
++ *         - \ref NVML_SUCCESS                 Successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     If the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a vgpuTypeId is invalid, or \a gspHeapSize is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *gspHeapSize);
++
++/**
++ * Retrieve the static framebuffer reservation of the vGPU type in bytes
++ *
++ * @param vgpuTypeId                           Handle to vGPU type
++ * @param fbReservation                        Reference to return the framebuffer reservation
++ * @return
++ *         - \ref NVML_SUCCESS                 Successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     If the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a vgpuTypeId is invalid, or \a fbReservation is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           On any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbReservation);
++
++/**
++ * Set the desirable vGPU capability of a device
++ *
++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set.
++ * See \ref nvmlEnableState_t for available state.
++ *
++ * @param device                               The identifier of the target device
++ * @param capability                           Specifies the \a nvmlDeviceVgpuCapability_t to be set
++ * @param state                                The target capability mode
++ *
++ * @return
++ *      - \ref NVML_SUCCESS                    Successful completion
++ *      - \ref NVML_ERROR_UNINITIALIZED        If the library has not been successfully initialized
++ *      - \ref NVML_ERROR_INVALID_ARGUMENT     If \a device is invalid, or \a capability is invalid, or \a state is invalid
++ *      - \ref NVML_ERROR_NOT_SUPPORTED        The API is not supported in current state, or \a device not in vGPU mode
++ *      - \ref NVML_ERROR_UNKNOWN              On any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, nvmlEnableState_t state);
++
++/**
++ * Retrieve the vGPU Software licensable features.
++ *
++ * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s)
++ * and their current license status.
++ *
++ * @param device                    Identifier of the target device
++ * @param pGridLicensableFeatures   Pointer to structure in which vGPU software licensable features are returned
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if licensable features are successfully retrieved
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pGridLicensableFeatures is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlVgpu vGPU Management
++ * @{
++ *
++ * This chapter describes APIs supporting NVIDIA vGPU.
++ */
++/***************************************************************************************************/
++
++/**
++ * Retrieve the requested vGPU driver capability.
++ *
++ * Refer to the \a nvmlVgpuDriverCapability_t structure for the specific capabilities that can be queried.
++ * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability
++ * is supported.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param capability      Specifies the \a nvmlVgpuDriverCapability_t to be queried
++ * @param capResult       A boolean for the queried capability indicating that feature is supported
++ *
++ * @return
++ *      - \ref NVML_SUCCESS                      successful completion
++ *      - \ref NVML_ERROR_UNINITIALIZED          if the library has not been successfully initialized
++ *      - \ref NVML_ERROR_INVALID_ARGUMENT       if \a capability is invalid, or \a capResult is NULL
++ *      - \ref NVML_ERROR_NOT_SUPPORTED          the API is not supported in current state or devices are not in vGPU mode
++ *      - \ref NVML_ERROR_UNKNOWN                on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int *capResult);
++
++/**
++ * Retrieve the requested vGPU capability for GPU.
++ *
++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried.
++ * The return value in \a capResult reports a non-zero value indicating that the capability
++ * is supported, and also reports the capability's data based on the queried capability.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param device     The identifier of the target device
++ * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried
++ * @param capResult  Specifies that the queried capability is supported, and also returns capability's data
++ *
++ * @return
++ *      - \ref NVML_SUCCESS                      successful completion
++ *      - \ref NVML_ERROR_UNINITIALIZED          if the library has not been successfully initialized
++ *      - \ref NVML_ERROR_INVALID_ARGUMENT       if \a device is invalid, or \a capability is invalid, or \a capResult is NULL
++ *      - \ref NVML_ERROR_NOT_SUPPORTED          the API is not supported in current state or \a device not in vGPU mode
++ *      - \ref NVML_ERROR_UNKNOWN                on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int *capResult);
++
++/**
++ * Retrieve the supported vGPU types on a physical GPU (device).
++ *
++ * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
++ * is used to return the number of vGPU types written to the buffer.
++ *
++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
++ * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
++ *
++ * @param device                   The identifier of the target device
++ * @param vgpuCount                Pointer to caller-supplied array size, and returns number of vGPU types
++ * @param vgpuTypeIds              Pointer to caller-supplied array in which to return list of vGPU types
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                      successful completion
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN                on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
++
++/**
++ * Retrieve the currently creatable vGPU types on a physical GPU (device).
++ *
++ * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
++ * is used to return the number of vGPU types written to the buffer.
++ *
++ * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types
++ * can concurrently run on a device.  For example, if only one vGPU type is allowed at a time on a device, then the creatable
++ * list will be restricted to whatever vGPU type is already running on the device.
++ *
++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
++ * To query the number of vGPU types that can be created for the GPU, call this function with *vgpuCount = 0.
++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
++ *
++ * @param device                   The identifier of the target device
++ * @param vgpuCount                Pointer to caller-supplied array size, and returns number of vGPU types
++ * @param vgpuTypeIds              Pointer to caller-supplied array in which to return list of vGPU types
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                      successful completion
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN                on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
++
++/**
++ * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator).
++ * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param vgpuTypeClass            Pointer to string array to return class in
++ * @param size                     Size of string
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   successful completion
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size);
++
++/**
++ * Retrieve the vGPU type name.
++ *
++ * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not
++ * exceed 64 characters in length (including the NUL terminator).  See \ref
++ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param vgpuTypeName             Pointer to buffer to return name
++ * @param size                     Size of buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size);
++
++/**
++ * Retrieve the GPU Instance Profile ID for the given vGPU type ID.
++ * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is
++ * returned.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param gpuInstanceProfileId     GPU Instance Profile ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device is not in vGPU Host virtualization mode
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId);
++
++/**
++ * Retrieve the device ID of a vGPU type.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param deviceID                 Device ID and vendor ID of the device contained in single 32 bit value
++ * @param subsystemID              Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a deviceID or \a subsystemID are NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID);
++
++/**
++ * Retrieve the vGPU framebuffer size in bytes.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param fbSize                   Pointer to framebuffer size in bytes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a fbSize is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize);
++
++/**
++ * Retrieve count of vGPU's supported display heads.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param numDisplayHeads          Pointer to number of display heads
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads);
++
++/**
++ * Retrieve vGPU display head's maximum supported resolution.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param displayIndex             Zero-based index of display head
++ * @param xdim                     Pointer to maximum number of pixels in X dimension
++ * @param ydim                     Pointer to maximum number of pixels in Y dimension
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex
++ *                                             is out of range.
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim);
++
++/**
++ * Retrieve license requirements for a vGPU type
++ *
++ * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form
++ * "<license name>,<version>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license,
++ * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0".
++ *
++ * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
++ * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param vgpuTypeLicenseString    Pointer to buffer to return license info
++ * @param size                     Size of \a vgpuTypeLicenseString buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size);
++
++/**
++ * Retrieve the static frame rate limit value of the vGPU type
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param frameRateLimit           Reference to return the frame rate limit value
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit);
++
++/**
++ * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                   The identifier of the target device
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param vgpuInstanceCount        Pointer to get the max number of vGPU instances
++ *                                 that can be created on a device for given vgpuTypeId
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid or is not supported on target device,
++ *                                             or \a vgpuInstanceCount is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount);
++
++/**
++ * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param vgpuInstanceCountPerVm   Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm);
++
++/**
++ * Retrieve the BAR1 info for given vGPU type.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuTypeId               Handle to vGPU type
++ * @param bar1Info                 Pointer to the vGPU type BAR1 information structure \a nvmlVgpuTypeBar1Info_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a bar1Info is NULL
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetBAR1Info(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuTypeBar1Info_t *bar1Info);
++
++/**
++ * Retrieve the active vGPU instances on a device.
++ *
++ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
++ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
++ * written to the buffer.
++ *
++ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns
++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
++ * To query the number of active vGPU instances, call this function with *vgpuCount = 0.  The code will return
++ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param device                   The identifier of the target device
++ * @param vgpuCount                Pointer which passes in the array size and gets
++ *                                 back the number of vGPU instances
++ * @param vgpuInstances            Pointer to array in which to return list of vGPU instances
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid, or \a vgpuCount is NULL
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a vgpuInstances buffer is too small, element count is returned in \a vgpuCount
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances);
++
++/**
++ * Retrieve the VM ID associated with a vGPU instance.
++ *
++ * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator).
++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
++ *
++ * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param vmId                     Pointer to caller-supplied buffer to hold VM ID
++ * @param size                     Size of buffer in bytes
++ * @param vmIdType                 Pointer to hold VM ID type
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType);
++
++/**
++ * Retrieve the UUID of a vGPU instance.
++ *
++ * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string,
++ * not exceeding 80 characters in length (including the NUL terminator).
++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param uuid                     Pointer to caller-supplied buffer to hold vGPU UUID
++ * @param size                     Size of buffer in bytes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a uuid is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size);
++
++/**
++ * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU.
++ *
++ * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version
++ * string will not exceed 80 characters in length (including the NUL terminator).
++ * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
++ *
++ * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is
++ * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the
++ * NVIDIA driver is loaded and initialized.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param version                  Caller-supplied buffer to return driver version string
++ * @param length                   Size of \a version buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a version has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
++
++/**
++ * Retrieve the framebuffer usage in bytes.
++ *
++ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             The identifier of the target instance
++ * @param fbUsage                  Pointer to framebuffer usage in bytes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a fbUsage is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
++
++/**
++ * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2.
++ *
++ * Retrieve the current licensing state of the vGPU instance.
++ *
++ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param licensed                 Reference to return the licensing status
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a licensed has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a licensed is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
++
++/**
++ * Retrieve the vGPU type of a vGPU instance.
++ *
++ * Returns the vGPU type ID of vgpu assigned to the vGPU instance.
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param vgpuTypeId               Reference to return the vgpuTypeId
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a vgpuTypeId has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a vgpuTypeId is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
++
++/**
++ * Retrieve the frame rate limit set for the vGPU instance.
++ *
++ * Returns the value of the frame rate limit set for the vGPU instance
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param frameRateLimit           Reference to return the frame rate limit
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a frameRateLimit has been set
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a frameRateLimit is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
++
++/**
++ * Retrieve the current ECC mode of vGPU instance.
++ *
++ * @param vgpuInstance            The identifier of the target vGPU instance
++ * @param eccMode                 Reference in which to return the current ECC mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the vgpuInstance's ECC mode has been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a eccMode is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode);
++
++/**
++ * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param encoderCapacity          Reference to an unsigned int for the encoder capacity
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a encoderCapacity is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
++
++/**
++ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param encoderCapacity          Unsigned int for the encoder capacity value
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100.
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int  encoderCapacity);
++
++/**
++ * Retrieves the current encoder statistics of a vGPU Instance
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance                      Identifier of the target vGPU instance
++ * @param sessionCount                      Reference to an unsigned int for count of active encoder sessions
++ * @param averageFps                        Reference to an unsigned int for trailing average FPS of all active sessions
++ * @param averageLatency                    Reference to an unsigned int for encode latency in microseconds
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a sessionCount, \a averageFps and \a averageLatency are fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount, \a averageFps or \a averageLatency is NULL
++ *                                              or \a vgpuInstance is 0.
++ *         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
++                                                     unsigned int *averageFps, unsigned int *averageLatency);
++
++/**
++ * Retrieves information about all active encoder sessions on a vGPU Instance.
++ *
++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
++ * written to the buffer.
++ *
++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns
++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
++ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance                      Identifier of the target vGPU instance
++ * @param sessionCount                      Reference to caller supplied array size, and returns
++ *                                          the number of sessions.
++ * @param sessionInfo                       Reference to caller supplied array in which the list
++ *                                          of session information is returned.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
++ *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is
++ *                                              returned in \a sessionCount
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL, or \a vgpuInstance is 0.
++ *         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
++
++/**
++* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance
++*
++* For Maxwell &tm; or newer fully supported devices.
++*
++* @param vgpuInstance                      Identifier of the target vGPU instance
++* @param fbcStats                          Reference to nvmlFBCStats_t structure containing NvFBC stats
++*
++* @return
++*         - \ref NVML_SUCCESS                  if \a fbcStats is fetched
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a vgpuInstance is 0, or \a fbcStats is NULL
++*         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats);
++
++/**
++* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
++*
++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
++* written to the buffer.
++*
++* If the supplied buffer is not large enough to accommodate the active session array, the function returns
++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
++* To query the number of active FBC sessions, call this function with *sessionCount = 0.  The code will return
++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
++*
++* For Maxwell &tm; or newer fully supported devices.
++*
++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
++*       be zero if there are no new frames captured since the session started.
++*
++* @param vgpuInstance                      Identifier of the target vGPU instance
++* @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
++* @param sessionInfo                       Reference in which to return the session information
++*
++* @return
++*         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a vgpuInstance is 0, or \a sessionCount is NULL.
++*         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
++*         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
++
++/**
++* Retrieve the GPU Instance ID for the given vGPU Instance.
++* The API will return a valid GPU Instance ID for MIG backed vGPU Instance, else INVALID_GPU_INSTANCE_ID is returned.
++*
++* For Kepler &tm; or newer fully supported devices.
++*
++* @param vgpuInstance                      Identifier of the target vGPU instance
++* @param gpuInstanceId                     GPU Instance ID
++*
++* @return
++*         - \ref NVML_SUCCESS                  successful completion
++*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a vgpuInstance is 0, or \a gpuInstanceId is NULL.
++*         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
++*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int *gpuInstanceId);
++
++/**
++* Retrieves the PCI Id of the given vGPU Instance i.e. the PCI Id of the GPU as seen inside the VM.
++*
++* The vGPU PCI id is returned as "00000000:00:00.0" if NVIDIA driver is not installed on the vGPU instance.
++*
++* @param vgpuInstance                         Identifier of the target vGPU instance
++* @param vgpuPciId                            Caller-supplied buffer to return vGPU PCI Id string
++* @param length                               Size of the vgpuPciId buffer
++*
++* @return
++*         - \ref NVML_SUCCESS                 if vGPU PCI Id is successfully retrieved
++*         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a vgpuPciId is NULL
++*         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++*         - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance
++*         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small, \a length is set to required length
++*         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length);
++
++/**
++* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure
++* for the specific capabilities that can be queried. The return value in \a capResult should be treated as
++* a boolean, with a non-zero value indicating that the capability is supported.
++*
++* For Maxwell &tm; or newer fully supported devices.
++*
++* @param vgpuTypeId                           Handle to vGPU type
++* @param capability                           Specifies the \a nvmlVgpuCapability_t to be queried
++* @param capResult                            A boolean for the queried capability indicating that feature is supported
++*
++* @return
++*         - \ref NVML_SUCCESS                 successful completion
++*         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++*         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL
++*         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++*/
++nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult);
++
++/**
++ * Retrieve the MDEV UUID of a vGPU instance.
++ *
++ * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string,
++ * not exceeding 80 characters in length (including the NULL terminator).
++ * MDEV UUID is displayed only on KVM platform.
++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance             Identifier of the target vGPU instance
++ * @param mdevUuid                 Pointer to caller-supplied buffer to hold MDEV UUID
++ * @param size                     Size of buffer in bytes
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 successful completion
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     on any hypervisor other than KVM
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a mdevUuid is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvml vGPU Migration
++ * This chapter describes operations that are associated with vGPU Migration.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Structure representing range of vGPU versions.
++ */
++typedef struct nvmlVgpuVersion_st
++{
++    unsigned int minVersion; //!< Minimum vGPU version.
++    unsigned int maxVersion; //!< Maximum vGPU version.
++} nvmlVgpuVersion_t;
++
++/**
++ * vGPU metadata structure.
++ */
++typedef struct nvmlVgpuMetadata_st
++{
++    unsigned int             version;                                                    //!< Current version of the structure
++    unsigned int             revision;                                                   //!< Current revision of the structure
++    nvmlVgpuGuestInfoState_t guestInfoState;                                             //!< Current state of Guest-dependent fields
++    char                     guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest
++    char                     hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];  //!< Version of driver installed in host
++    unsigned int             reserved[6];                                                //!< Reserved for internal use
++    unsigned int             vgpuVirtualizationCaps;                                     //!< vGPU virtualization capabilities bitfield
++    unsigned int             guestVgpuVersion;                                           //!< vGPU version of guest driver
++    unsigned int             opaqueDataSize;                                             //!< Size of opaque data field in bytes
++    char                     opaqueData[4];                                              //!< Opaque data
++} nvmlVgpuMetadata_t;
++
++/**
++ * Physical GPU metadata structure
++ */
++typedef struct nvmlVgpuPgpuMetadata_st
++{
++    unsigned int            version;                                                    //!< Current version of the structure
++    unsigned int            revision;                                                   //!< Current revision of the structure
++    char                    hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];  //!< Host driver version
++    unsigned int            pgpuVirtualizationCaps;                                     //!< Pgpu virtualization capabilities bitfield
++    unsigned int            reserved[5];                                                //!< Reserved for internal use
++    nvmlVgpuVersion_t       hostSupportedVgpuRange;                                     //!< vGPU version range supported by host driver
++    unsigned int            opaqueDataSize;                                             //!< Size of opaque data field in bytes
++    char                    opaqueData[4];                                              //!< Opaque data
++} nvmlVgpuPgpuMetadata_t;
++
++/**
++ * vGPU VM compatibility codes
++ */
++typedef enum nvmlVgpuVmCompatibility_enum
++{
++    NVML_VGPU_VM_COMPATIBILITY_NONE         = 0x0,    //!< vGPU is not runnable
++    NVML_VGPU_VM_COMPATIBILITY_COLD         = 0x1,    //!< vGPU is runnable from a cold / powered-off state (ACPI S5)
++    NVML_VGPU_VM_COMPATIBILITY_HIBERNATE    = 0x2,    //!< vGPU is runnable from a hibernated state (ACPI S4)
++    NVML_VGPU_VM_COMPATIBILITY_SLEEP        = 0x4,    //!< vGPU is runnable from a sleep state (ACPI S3)
++    NVML_VGPU_VM_COMPATIBILITY_LIVE         = 0x8     //!< vGPU is runnable from a live/paused state (ACPI S0)
++} nvmlVgpuVmCompatibility_t;
++
++/**
++ *  vGPU-pGPU compatibility limit codes
++ */
++typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum
++{
++    NVML_VGPU_COMPATIBILITY_LIMIT_NONE          = 0x0,           //!< Compatibility is not limited.
++    NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER   = 0x1,           //!< Compatibility is limited by host driver version.
++    NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER  = 0x2,           //!< Compatibility is limited by guest driver version.
++    NVML_VGPU_COMPATIBILITY_LIMIT_GPU           = 0x4,           //!< Compatibility is limited by GPU hardware.
++    NVML_VGPU_COMPATIBILITY_LIMIT_OTHER         = 0x80000000     //!< Compatibility is limited by an undefined factor.
++} nvmlVgpuPgpuCompatibilityLimitCode_t;
++
++/**
++ * vGPU-pGPU compatibility structure
++ */
++typedef struct nvmlVgpuPgpuCompatibility_st
++{
++    nvmlVgpuVmCompatibility_t               vgpuVmCompatibility;    //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t
++    nvmlVgpuPgpuCompatibilityLimitCode_t    compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t
++} nvmlVgpuPgpuCompatibility_t;
++
++/**
++ * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM
++ * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section
++ * containing internal state.
++ *
++ * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are
++ * dependent on information obtained from the guest VM, which may not yet have reached a state where that information
++ * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field.
++ *
++ * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide
++ * it to Virtual GPU Manager when creating a vGPU for subsequent instances of the VM.
++ *
++ * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure
++ * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
++ * in \a bufferSize.
++ *
++ * @param vgpuInstance             vGPU instance handle
++ * @param vgpuMetadata             Pointer to caller-supplied buffer into which vGPU metadata is written
++ * @param bufferSize               Size of vgpuMetadata buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   vGPU metadata structure was successfully returned
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   vgpuMetadata buffer is too small, required size is returned in \a bufferSize
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0.
++ *         - \ref NVML_ERROR_NOT_FOUND           if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize);
++
++/**
++ * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about
++ * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section
++ * containing internal state.
++ *
++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata
++ * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
++ * in \a bufferSize.
++ *
++ * @param device                The identifier of the target device
++ * @param pgpuMetadata          Pointer to caller-supplied buffer into which \a pgpuMetadata is written
++ * @param bufferSize            Pointer to size of \a pgpuMetadata buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   GPU metadata structure was successfully returned
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   pgpuMetadata buffer is too small, required size is returned in \a bufferSize
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
++ *         - \ref NVML_ERROR_NOT_SUPPORTED       vGPU is not supported by the system
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize);
++
++/**
++ * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a
++ * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the
++ * physical GPU.
++ *
++ * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
++ * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
++ * with the physical GPU is limited, a limit code indicates the factor limiting compatibility.
++ * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
++ *
++ * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to
++ *       boot a given vGPU or associated VM.
++ *
++ * @param vgpuMetadata          Pointer to caller-supplied vGPU metadata structure
++ * @param pgpuMetadata          Pointer to caller-supplied GPU metadata structure
++ * @param compatibilityInfo     Pointer to caller-supplied buffer to hold compatibility info
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   vGPU compatibility information was successfully returned
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a vgpuMetadata or \a pgpuMetadata or \a compatibilityInfo are NULL
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo);
++
++/**
++ * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format.
++ *
++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the
++ * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
++ * in \a bufferSize.
++ *
++ * @param device                The identifier of the target device
++ * @param pgpuMetadata          Pointer to caller-supplied buffer into which \a pgpuMetadata is written
++ * @param bufferSize            Pointer to size of \a pgpuMetadata buffer
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   GPU metadata structure was successfully returned
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
++ *         - \ref NVML_ERROR_NOT_SUPPORTED       if vGPU is not supported by the system
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize);
++
++/**
++ * Returns the vGPU Software scheduler logs.
++ * \a pSchedulerLog points to a caller-allocated structure to contain the logs. The number of elements returned will
++ * never exceed \a NVML_SCHEDULER_SW_MAX_LOG_ENTRIES.
++ *
++ * To get the entire logs, call the function at least 5 times a second.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                The identifier of the target \a device
++ * @param pSchedulerLog         Reference in which \a pSchedulerLog is written
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   vGPU scheduler logs were successfully obtained
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a pSchedulerLog is NULL or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED       The API is not supported in current state or \a device not in vGPU host mode
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t *pSchedulerLog);
++
++/**
++ * Returns the vGPU scheduler state.
++ * The information returned in \a nvmlVgpuSchedulerGetState_t is not relevant if the BEST EFFORT policy is set.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                The identifier of the target \a device
++ * @param pSchedulerState       Reference in which \a pSchedulerState is returned
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   vGPU scheduler state is successfully obtained
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a pSchedulerState is NULL or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED       The API is not supported in current state or \a device not in vGPU host mode
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState);
++
++/**
++ * Returns the vGPU scheduler capabilities.
++ * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from
++ * the NVML_VGPU_SCHEDULER_POLICY_*. This list enumerates the supported scheduler policies
++ * if the engine is Graphics type.
++ * The other values in \a nvmlVgpuSchedulerCapabilities_t are also applicable if the engine is
++ * Graphics type. For other engine types, it is BEST EFFORT policy.
++ * If ARR is supported and enabled, scheduling frequency and averaging factor are applicable
++ * else timeSlice is applicable.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * @param device                The identifier of the target \a device
++ * @param pCapabilities         Reference in which \a pCapabilities is written
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                   vGPU scheduler capabilities were successfully obtained
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a pCapabilities is NULL or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED       The API is not supported in current state or \a device not in vGPU host mode
++ *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities);
++
++/**
++ * Sets the vGPU scheduler state.
++ *
++ * For Pascal &tm; or newer fully supported devices.
++ *
++ * The scheduler state change won't persist across module load/unload.
++ * Scheduler state and params will be allowed to set only when no VM is running.
++ * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode is enabled then
++ * provide avgFactorForARR and frequency as input. If enableARRMode is disabled
++ * then provide timeslice as input.
++ *
++ * @param device                The identifier of the target \a device
++ * @param pSchedulerState       vGPU \a pSchedulerState to set
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  vGPU scheduler state has been successfully set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a pSchedulerState is NULL or \a device is invalid
++ *         - \ref NVML_ERROR_RESET_REQUIRED     if setting \a pSchedulerState failed with fatal error,
++ *                                              a reboot is required to recover from this error.
++ *         - \ref NVML_ERROR_NOT_SUPPORTED      The API is not supported in current state or \a device not in vGPU host mode
++ *                                              or if any vGPU instance currently exists on the \a device
++ *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState);
++
++/*
++ * Virtual GPU (vGPU) version
++ *
++ * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that
++ * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager
++ * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM.
++ *
++ * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver
++ * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration.
++ */
++
++/**
++ * Query the ranges of supported vGPU versions.
++ *
++ * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator.
++ * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same.
++ *
++ * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges:
++ * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager.
++ * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range.
++ *
++ * @param supported  Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written
++ * @param current    Pointer to the structure in which the range of supported vGPU versions set by an administrator is written
++ *
++ * @return
++ * - \ref NVML_SUCCESS                 The vGPU version range structures were successfully obtained.
++ * - \ref NVML_ERROR_NOT_SUPPORTED     The API is not supported.
++ * - \ref NVML_ERROR_INVALID_ARGUMENT  The \a supported parameter or the \a current parameter is NULL.
++ * - \ref NVML_ERROR_UNKNOWN           An error occurred while the data was being fetched.
++ */
++nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current);
++
++/**
++ * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator.
++ *
++ * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the
++ * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to
++ * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported.
++ *
++ * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports.
++ *
++ * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager.
++ *
++ * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned.
++ *       2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load.
++ *       3. If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated
++ *          vGPU version that is the maximum value in the overlapping range.
++ *       4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails.
++ *
++ * @param vgpuVersion   Pointer to a caller-supplied range of supported vGPU versions.
++ *
++ * @return
++ * - \ref NVML_SUCCESS                 The preset range of supported vGPU versions was successfully overridden.
++ * - \ref NVML_ERROR_NOT_SUPPORTED     The API is not supported.
++ * - \ref NVML_ERROR_IN_USE            The range was not overridden because a VM is running on the host.
++ * - \ref NVML_ERROR_INVALID_ARGUMENT  The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL.
++ */
++nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlUtil vGPU Utilization and Accounting
++ * This chapter describes operations that are associated with vGPU Utilization and Accounting.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Retrieves current utilization for vGPUs on a physical GPU (device).
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running
++ * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer
++ * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the
++ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
++ * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
++ * indicate the returned value type.
++ *
++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
++ * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
++ * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with
++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the
++ * buffer is sized for.
++ *
++ * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample
++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
++ * destroyed.
++ *
++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * @param device                        The identifier for the target device
++ * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param sampleValType                 Pointer to caller-supplied buffer to hold the type of returned sample values
++ * @param vgpuInstanceSamplesCount      Pointer to caller-supplied array size, and returns number of vGPU instances
++ * @param utilizationSamples            Pointer to caller-supplied buffer in which vGPU utilization samples are returned
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is
++ *                                             NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all
++ *                                             vGPU instances currently executing on the device
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
++                                                  nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount,
++                                                  nvmlVgpuInstanceUtilizationSample_t *utilizationSamples);
++
++/**
++ * Retrieves recent utilization for vGPU instances running on a physical GPU (device).
++ *
++ * For Kepler &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for vGPU
++ * instances running on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied
++ * buffer pointed at by \a vgpuUtilInfo->vgpuUtilArray. One utilization sample structure is returned per vGPU instance, and includes the
++ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
++ * in nvmlValue_t unions. The function sets the caller-supplied \a vgpuUtilInfo->sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
++ * indicate the returned value type.
++ *
++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
++ * \a vgpuUtilInfo->vgpuUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
++ * count in \a vgpuUtilInfo->vgpuInstanceCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
++ * a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t). Invoke the function again with
++ * the allocated buffer passed in \a vgpuUtilInfo->vgpuUtilArray, and \a vgpuUtilInfo->vgpuInstanceCount set to the number of entries the
++ * buffer is sized for.
++ *
++ * On successful return, the function updates \a vgpuUtilInfo->vgpuInstanceCount with the number of vGPU utilization sample
++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
++ * destroyed.
++ *
++ * \a vgpuUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a vgpuUtilInfo->lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * @param device                        The identifier for the target device
++ * @param vgpuUtilInfo                  Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          if utilization samples are successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED              if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           if \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST                if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  if the version of \a vgpuUtilInfo is invalid
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE          if \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small.
++ *                                                      The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call
++ *                                                      the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t)
++ *         - \ref NVML_ERROR_NOT_FOUND                  if sample entries are not found
++ *         - \ref NVML_ERROR_UNKNOWN                    on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice_t device,
++                                                               nvmlVgpuInstancesUtilizationInfo_t *vgpuUtilInfo);
++
++/**
++ * Retrieves current utilization for processes running on vGPUs on a physical GPU (device).
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on
++ * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the
++ * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running
++ * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
++ *
++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current count of
++ * processes running on vGPU instances in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size
++ * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with
++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the
++ * buffer is sized for.
++ *
++ * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU sub process utilization sample
++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
++ * in any given sample period.
++ *
++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * @param device                        The identifier for the target device
++ * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param vgpuProcessSamplesCount       Pointer to caller-supplied array size, and returns number of processes running on vGPU instances
++ * @param utilizationSamples            Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuProcessSamplesCount is NULL, or a sample count of 0 is
++ *                                             passed with a non-NULL \a utilizationSamples
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all
++ *                                             processes on vGPU instances currently executing on the device
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
++                                                         unsigned int *vgpuProcessSamplesCount,
++                                                         nvmlVgpuProcessUtilizationSample_t *utilizationSamples);
++
++/**
++ * Retrieves recent utilization for processes running on vGPU instances on a physical GPU (device).
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for processes running
++ * on vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied
++ * buffer pointed at by \a vgpuProcUtilInfo->vgpuProcUtilArray. One utilization sample structure is returned per process running
++ * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
++ *
++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
++ * \a vgpuProcUtilInfo->vgpuProcUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current processes' count
++ * running on vGPU instances in \a vgpuProcUtilInfo->vgpuProcessCount. The caller should allocate a buffer of size
++ * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationInfo_t). Invoke the function again with the allocated buffer passed
++ * in \a vgpuProcUtilInfo->vgpuProcUtilArray, and \a vgpuProcUtilInfo->vgpuProcessCount set to the number of entries the buffer is sized for.
++ *
++ * On successful return, the function updates \a vgpuProcUtilInfo->vgpuProcessCount with the number of vGPU sub process utilization sample
++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
++ * in any given sample period.
++ *
++ * vgpuProcUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set vgpuProcUtilInfo->lastSeenTimeStamp
++ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
++ *
++ * @param device                        The identifier for the target device
++ * @param vgpuProcUtilInfo              Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                          if utilization samples are successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED              if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT           if \a device is invalid, or \a vgpuProcUtilInfo is null
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH  if the version of \a vgpuProcUtilInfo is invalid
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE          if \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount
++ *                                                      is too small to return samples for all processes on vGPU instances currently executing on the device.
++ *                                                      The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount,
++ *                                                      and call the function again with a buffer of size
++ *                                                      vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationInfo_t)
++ *         - \ref NVML_ERROR_NOT_SUPPORTED              if vGPU is not supported by the device
++ *         - \ref NVML_ERROR_GPU_IS_LOST                if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_NOT_FOUND                  if sample entries are not found
++ *         - \ref NVML_ERROR_UNKNOWN                    on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice_t device, nvmlVgpuProcessesUtilizationInfo_t *vgpuProcUtilInfo);
++
++/**
++ * Queries the state of per process accounting mode on vGPU.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance            The identifier of the target vGPU instance
++ * @param mode                    Reference in which to return the current accounting mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a mode is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature
++ *         - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode);
++
++/**
++ * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes
++ * returned can be in running or terminated state.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * To just query the maximum number of processes that can be queried, call this function with *count = 0 and
++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
++ *
++ * For more details see \ref nvmlVgpuInstanceGetAccountingStats.
++ *
++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
++ *
++ * @param vgpuInstance            The identifier of the target vGPU instance
++ * @param count                   Reference in which to provide the \a pids array size, and
++ *                                to return the number of elements ready to be queried
++ * @param pids                    Reference in which to return list of process ids
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a count is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature or accounting mode is disabled
++ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value)
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ *
++ * @see nvmlVgpuInstanceGetAccountingStats
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids);
++
++/**
++ * Queries process's accounting stats.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and
++ * can be queried during life time of the process or after its termination.
++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and
++ * updated to actual running time after its termination.
++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
++ * processes.
++ *
++ * See \ref nvmlAccountingStats_t for description of each returned metric.
++ * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids.
++ *
++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode.
++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
++ *         queried since they don't contribute to GPU utilization.
++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
++ *
++ * @param vgpuInstance            The identifier of the target vGPU instance
++ * @param pid                     Process Id of the target process to query stats for
++ * @param stats                   Reference in which to return the process's accounting stats
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a stats is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *                                             or \a stats is not found
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature or accounting mode is disabled
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats);
++
++/**
++ * Clears accounting information of processes on the vGPU instance that have already terminated.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ * Requires root/admin permissions.
++ *
++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode.
++ * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications
++ *         stats don't contribute to GPU utilization.
++ *
++ * @param vgpuInstance            The identifier of the target vGPU instance
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if accounting information has been cleared
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid
++ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature or accounting mode is disabled
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance);
++
++/**
++ * Query the license information of the vGPU instance.
++ *
++ * For Maxwell &tm; or newer fully supported devices.
++ *
++ * @param vgpuInstance              Identifier of the target vGPU instance
++ * @param licenseInfo               Pointer to vGPU license information structure
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if information is successfully retrieved
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a licenseInfo is NULL
++ *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
++ *         - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo);
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlExcludedGpuQueries Excluded GPU Queries
++ * This chapter describes NVML operations that are associated with excluded GPUs.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Excluded GPU device information
++ **/
++typedef struct nvmlExcludedDeviceInfo_st
++{
++    nvmlPciInfo_t pciInfo;                   //!< The PCI information for the excluded GPU
++    char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the excluded GPU
++} nvmlExcludedDeviceInfo_t;
++
++ /**
++ * Retrieves the number of excluded GPU devices in the system.
++ *
++ * For all products.
++ *
++ * @param deviceCount                          Reference in which to return the number of excluded devices
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a deviceCount has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a deviceCount is NULL
++ */
++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceCount(unsigned int *deviceCount);
++
++/**
++ * Acquire the device information for an excluded GPU device, based on its index.
++ *
++ * For all products.
++ *
++ * Valid indices are derived from the \a deviceCount returned by
++ *   \ref nvmlGetExcludedDeviceCount(). For example, if \a deviceCount is 2 the valid indices
++ *   are 0 and 1, corresponding to GPU 0 and GPU 1.
++ *
++ * @param index                                The index of the target GPU, >= 0 and < \a deviceCount
++ * @param info                                 Reference in which to return the device information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                  if \a info has been set
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a index is invalid or \a info is NULL
++ *
++ * @see nvmlGetExcludedDeviceCount
++ */
++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t *info);
++
++/** @} */
++
++/***************************************************************************************************/
++/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management
++ * This chapter describes NVML operations that are associated with Multi Instance GPU management.
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Disable Multi Instance GPU mode.
++ */
++#define NVML_DEVICE_MIG_DISABLE 0x0
++
++/**
++ * Enable Multi Instance GPU mode.
++ */
++#define NVML_DEVICE_MIG_ENABLE 0x1
++
++/**
++ * GPU instance profiles.
++ *
++ * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the
++ * detailed information about a GPU instance such as profile ID, engine counts.
++ */
++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE      0x0
++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE      0x1
++#define NVML_GPU_INSTANCE_PROFILE_3_SLICE      0x2
++#define NVML_GPU_INSTANCE_PROFILE_4_SLICE      0x3
++#define NVML_GPU_INSTANCE_PROFILE_7_SLICE      0x4
++#define NVML_GPU_INSTANCE_PROFILE_8_SLICE      0x5
++#define NVML_GPU_INSTANCE_PROFILE_6_SLICE      0x6
++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7
++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8
++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9
++#define NVML_GPU_INSTANCE_PROFILE_COUNT        0xA
++
++/**
++ * MIG GPU instance profile capability.
++ *
++ * Bit field values representing MIG profile capabilities
++ * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities
++ */
++#define NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P     0x1
++
++/**
++ * MIG compute instance profile capability.
++ *
++ * Bit field values representing MIG profile capabilities
++ * \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities
++ */
++/* No capabilities for compute profiles currently exposed */
++
++typedef struct nvmlGpuInstancePlacement_st /* Range of memory slices occupied by a GPU instance */
++{
++    unsigned int start;               //!< Index of first occupied memory slice
++    unsigned int size;                //!< Number of memory slices occupied
++} nvmlGpuInstancePlacement_t;
++
++/**
++ * GPU instance profile information.
++ */
++typedef struct nvmlGpuInstanceProfileInfo_st
++{
++    unsigned int id;                  //!< Unique profile ID within the device
++    unsigned int isP2pSupported;      //!< Peer-to-Peer support
++    unsigned int sliceCount;          //!< GPU Slice count
++    unsigned int instanceCount;       //!< GPU instance count
++    unsigned int multiprocessorCount; //!< Streaming Multiprocessor count
++    unsigned int copyEngineCount;     //!< Copy Engine count
++    unsigned int decoderCount;        //!< Decoder Engine count
++    unsigned int encoderCount;        //!< Encoder Engine count
++    unsigned int jpegCount;           //!< JPEG Engine count
++    unsigned int ofaCount;            //!< OFA Engine count
++    unsigned long long memorySizeMB;  //!< Memory size in MBytes
++} nvmlGpuInstanceProfileInfo_t;
++
++/**
++ * GPU instance profile information (v2).
++ *
++ * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field
++ * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name
++ * field to the end. This structure is not backwards-compatible with
++ * \ref nvmlGpuInstanceProfileInfo_t.
++ */
++typedef struct nvmlGpuInstanceProfileInfo_v2_st
++{
++    unsigned int version;                       //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2)
++    unsigned int id;                            //!< Unique profile ID within the device
++    unsigned int isP2pSupported;                //!< Peer-to-Peer support
++    unsigned int sliceCount;                    //!< GPU Slice count
++    unsigned int instanceCount;                 //!< GPU instance count
++    unsigned int multiprocessorCount;           //!< Streaming Multiprocessor count
++    unsigned int copyEngineCount;               //!< Copy Engine count
++    unsigned int decoderCount;                  //!< Decoder Engine count
++    unsigned int encoderCount;                  //!< Encoder Engine count
++    unsigned int jpegCount;                     //!< JPEG Engine count
++    unsigned int ofaCount;                      //!< OFA Engine count
++    unsigned long long memorySizeMB;            //!< Memory size in MBytes
++    char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name
++} nvmlGpuInstanceProfileInfo_v2_t;
++
++/**
++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version.
++ */
++#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2)
++
++/**
++ * GPU instance profile information (v3).
++ *
++ * Version 3 removes the isP2pSupported field and adds the \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities
++ * field. This structure is not backwards-compatible with \ref nvmlGpuInstanceProfileInfo_t.
++ */
++typedef struct nvmlGpuInstanceProfileInfo_v3_st
++{
++    unsigned int version;                       //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v3)
++    unsigned int id;                            //!< Unique profile ID within the device
++    unsigned int sliceCount;                    //!< GPU Slice count
++    unsigned int instanceCount;                 //!< GPU instance count
++    unsigned int multiprocessorCount;           //!< Streaming Multiprocessor count
++    unsigned int copyEngineCount;               //!< Copy Engine count
++    unsigned int decoderCount;                  //!< Decoder Engine count
++    unsigned int encoderCount;                  //!< Encoder Engine count
++    unsigned int jpegCount;                     //!< JPEG Engine count
++    unsigned int ofaCount;                      //!< OFA Engine count
++    unsigned long long memorySizeMB;            //!< Memory size in MBytes
++    char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name
++    unsigned int capabilities;                  //!< Additional capabilities
++} nvmlGpuInstanceProfileInfo_v3_t;
++
++/**
++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v3_t.version.
++ */
++#define nvmlGpuInstanceProfileInfo_v3 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 3)
++
++typedef struct nvmlGpuInstanceInfo_st /* Parent device, IDs and placement describing one GPU instance */
++{
++    nvmlDevice_t device;                      //!< Parent device
++    unsigned int id;                          //!< Unique instance ID within the device
++    unsigned int profileId;                   //!< Unique profile ID within the device
++    nvmlGpuInstancePlacement_t placement;     //!< Placement for this instance
++} nvmlGpuInstanceInfo_t;
++
++typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t;
++
++/**
++ * Compute instance profiles.
++ *
++ * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the
++ * detailed information about a compute instance such as profile ID, engine counts
++ */
++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE       0x0
++#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE       0x1
++#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE       0x2
++#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE       0x3
++#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE       0x4
++#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE       0x5
++#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE       0x6
++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1  0x7
++#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT         0x8
++
++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared
++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT  0x1
++
++typedef struct nvmlComputeInstancePlacement_st /* Range of compute slices occupied (presumably by one compute instance) */
++{
++    unsigned int start;                 //!< Index of first occupied compute slice
++    unsigned int size;                  //!< Number of compute slices occupied
++} nvmlComputeInstancePlacement_t;
++
++/**
++ * Compute instance profile information.
++ */
++typedef struct nvmlComputeInstanceProfileInfo_st
++{
++    unsigned int id;                    //!< Unique profile ID within the GPU instance
++    unsigned int sliceCount;            //!< GPU Slice count
++    unsigned int instanceCount;         //!< Compute instance count
++    unsigned int multiprocessorCount;   //!< Streaming Multiprocessor count
++    unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count
++    unsigned int sharedDecoderCount;    //!< Shared Decoder Engine count
++    unsigned int sharedEncoderCount;    //!< Shared Encoder Engine count
++    unsigned int sharedJpegCount;       //!< Shared JPEG Engine count
++    unsigned int sharedOfaCount;        //!< Shared OFA Engine count
++} nvmlComputeInstanceProfileInfo_t;
++
++/**
++ * Compute instance profile information (v2).
++ *
++ * Version 2 adds the \ref nvmlComputeInstanceProfileInfo_v2_t.version field
++ * to the start of the structure, and the \ref nvmlComputeInstanceProfileInfo_v2_t.name
++ * field to the end. This structure is not backwards-compatible with
++ * \ref nvmlComputeInstanceProfileInfo_t.
++ */
++typedef struct nvmlComputeInstanceProfileInfo_v2_st
++{
++    unsigned int version;                       //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v2)
++    unsigned int id;                            //!< Unique profile ID within the GPU instance
++    unsigned int sliceCount;                    //!< GPU Slice count
++    unsigned int instanceCount;                 //!< Compute instance count
++    unsigned int multiprocessorCount;           //!< Streaming Multiprocessor count
++    unsigned int sharedCopyEngineCount;         //!< Shared Copy Engine count
++    unsigned int sharedDecoderCount;            //!< Shared Decoder Engine count
++    unsigned int sharedEncoderCount;            //!< Shared Encoder Engine count
++    unsigned int sharedJpegCount;               //!< Shared JPEG Engine count
++    unsigned int sharedOfaCount;                //!< Shared OFA Engine count
++    char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name
++} nvmlComputeInstanceProfileInfo_v2_t;
++
++/**
++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v2_t.version.
++ */
++#define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2)
++
++/**
++ * Compute instance profile information (v3).
++ *
++ * Version 3 adds the \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities field
++ * to the end of \ref nvmlComputeInstanceProfileInfo_v2_t.
++ */
++typedef struct nvmlComputeInstanceProfileInfo_v3_st
++{
++    unsigned int version;                       //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v3)
++    unsigned int id;                            //!< Unique profile ID within the GPU instance
++    unsigned int sliceCount;                    //!< GPU Slice count
++    unsigned int instanceCount;                 //!< Compute instance count
++    unsigned int multiprocessorCount;           //!< Streaming Multiprocessor count
++    unsigned int sharedCopyEngineCount;         //!< Shared Copy Engine count
++    unsigned int sharedDecoderCount;            //!< Shared Decoder Engine count
++    unsigned int sharedEncoderCount;            //!< Shared Encoder Engine count
++    unsigned int sharedJpegCount;               //!< Shared JPEG Engine count
++    unsigned int sharedOfaCount;                //!< Shared OFA Engine count
++    char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name
++    unsigned int capabilities;                  //!< Additional capabilities
++} nvmlComputeInstanceProfileInfo_v3_t;
++
++/**
++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v3_t.version.
++ */
++#define nvmlComputeInstanceProfileInfo_v3 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 3)
++
++typedef struct nvmlComputeInstanceInfo_st
++{
++    nvmlDevice_t device;                      //!< Parent device
++    nvmlGpuInstance_t gpuInstance;            //!< Parent GPU instance
++    unsigned int id;                          //!< Unique instance ID within the GPU instance
++    unsigned int profileId;                   //!< Unique profile ID within the GPU instance
++    nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount}
++} nvmlComputeInstanceInfo_t;
++
++typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t;
++
++/**
++ * Set MIG mode for the device.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Requires root user.
++ *
++ * This mode determines whether a GPU instance can be created.
++ *
++ * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the
++ * device, such as minor number, might change. The caller of this API is expected to query such attributes again.
++ *
++ * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM
++ * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases.
++ *
++ * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device
++ * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API
++ * is expected to idle the device and retry setting the \a mode.
++ *
++ * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref
++ *       NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API.
++ *
++ * @param device                               The identifier of the target device
++ * @param mode                                 The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or
++ *                                             \ref NVML_DEVICE_MIG_ENABLE
++ * @param activationStatus                     Returns the status of activating the requested mode
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a mode or \a activationStatus are invalid
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support MIG mode
++ */
++nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus);
++
++/**
++ * Get MIG mode for the device.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ *
++ * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the
++ * next activation trigger.
++ *
++ * @param device                               The identifier of the target device
++ * @param currentMode                          Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or
++ *                                             \ref NVML_DEVICE_MIG_ENABLE
++ * @param pendingMode                          Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or
++ *                                             \ref NVML_DEVICE_MIG_ENABLE
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a currentMode or \a pendingMode are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support MIG mode
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode);
++
++/**
++ * Get GPU instance profile information
++ *
++ * Information provided by this API is immutable throughout the lifetime of a MIG mode.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ * @param profile                              One of the NVML_GPU_INSTANCE_PROFILE_*
++ * @param info                                 Returns detailed profile information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a profile or \a info are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support MIG or \a profile isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile,
++                                                         nvmlGpuInstanceProfileInfo_t *info);
++
++/**
++ * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned
++ * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure.
++ *
++ * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the
++ * appropriate version prior to calling this function. For example:
++ * \code
++ *     nvmlGpuInstanceProfileInfo_v2_t profileInfo =
++ *         { .version = nvmlGpuInstanceProfileInfo_v2 };
++ *     nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device,
++ *                                                                profile,
++ *                                                                &profileInfo);
++ * \endcode
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               The identifier of the target device
++ * @param profile                              One of the NVML_GPU_INSTANCE_PROFILE_*
++ * @param info                                 Returns detailed profile information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a profile, \a info, or \a info->version are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled or \a profile isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile,
++                                                          nvmlGpuInstanceProfileInfo_v2_t *info);
++
++/**
++ * Get GPU instance placements.
++ *
++ * A placement represents the location of a GPU instance within a device. This API only returns all the possible
++ * placements for the given profile regardless of whether MIG is enabled or not.
++ * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will
++ * fail if there is overlap with the already occupied memory slices.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param device                               The identifier of the target device
++ * @param profileId                            The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param placements                           Returns placements allowed for the profile. Can be NULL to discover number
++ *                                             of allowed placements for this profile. If non-NULL must be large enough
++ *                                             to accommodate the placements supported by the profile.
++ * @param count                                Returns number of allowed placements for the profile.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a profileId or \a count are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't support MIG or \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId,
++                                                                   nvmlGpuInstancePlacement_t *placements,
++                                                                   unsigned int *count);
++
++/**
++ * Get GPU instance profile capacity.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param device                               The identifier of the target device
++ * @param profileId                            The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param count                                Returns remaining instance count for the profile ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a profileId or \a count are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled or \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId,
++                                                               unsigned int *count);
++
++/**
++ * Create GPU instance.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would
++ * become invalid. The GPU instance must be recreated to acquire a valid handle.
++ *
++ * @param device                               The identifier of the target device
++ * @param profileId                            The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param gpuInstance                          Returns the GPU instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                       Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED           If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT        If \a device, \a profileId or \a gpuInstance are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED           If \a device doesn't have MIG mode enabled or in vGPU guest
++ *         - \ref NVML_ERROR_NO_PERMISSION           If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_INSUFFICIENT_RESOURCES  If the requested GPU instance could not be created
++ */
++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId,
++                                                 nvmlGpuInstance_t *gpuInstance);
++
++/**
++ * Create GPU instance with the specified placement.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would
++ * become invalid. The GPU instance must be recreated to acquire a valid handle.
++ *
++ * @param device                               The identifier of the target device
++ * @param profileId                            The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param placement                            The requested placement. See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2
++ * @param gpuInstance                          Returns the GPU instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                       Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED           If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT        If \a device, \a profileId, \a placement or \a gpuInstance
++ *                                                   are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED           If \a device doesn't have MIG mode enabled or in vGPU guest
++ *         - \ref NVML_ERROR_NO_PERMISSION           If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_INSUFFICIENT_RESOURCES  If the requested GPU instance could not be created
++ */
++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId,
++                                                              const nvmlGpuInstancePlacement_t *placement,
++                                                              nvmlGpuInstance_t *gpuInstance);
++/**
++ * Destroy GPU instance.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param gpuInstance                          The GPU instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled or in vGPU guest
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_IN_USE            If the GPU instance is in use. This error would be returned if processes
++ *                                             (e.g. CUDA application) or compute instances are active on the
++ *                                             GPU instance.
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance);
++
++/**
++ * Get GPU instances for given profile ID.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param device                               The identifier of the target device
++ * @param profileId                            The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param gpuInstances                         Returns pre-existing GPU instances, the buffer must be large enough to
++ *                                             accommodate the instances supported by the profile.
++ *                                             See \ref nvmlDeviceGetGpuInstanceProfileInfo
++ * @param count                                The count of returned GPU instances
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a profileId, \a gpuInstances or \a count are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId,
++                                               nvmlGpuInstance_t *gpuInstances, unsigned int *count);
++
++/**
++ * Get GPU instance for given instance ID.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param device                               The identifier of the target device
++ * @param id                                   The GPU instance ID
++ * @param gpuInstance                          Returns GPU instance
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a device, \a id or \a gpuInstance are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_NOT_FOUND         If the GPU instance is not found.
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance);
++
++/**
++ * Get GPU instance information.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param gpuInstance                          The GPU instance handle
++ * @param info                                 Returns GPU instance information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance or \a info are invalid
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info);
++
++/**
++ * Get compute instance profile information.
++ *
++ * Information provided by this API is immutable throughout the lifetime of a MIG mode.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profile                              One of the NVML_COMPUTE_INSTANCE_PROFILE_*
++ * @param engProfile                           One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_*
++ * @param info                                 Returns detailed profile information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a profile, \a engProfile or \a info are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a profile isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile,
++                                                                  unsigned int engProfile,
++                                                                  nvmlComputeInstanceProfileInfo_t *info);
++
++/**
++ * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned
++ * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure.
++ *
++ * @note The caller must set the \ref nvmlComputeInstanceProfileInfo_v2_t.version field to the
++ * appropriate version prior to calling this function. For example:
++ * \code
++ *     nvmlComputeInstanceProfileInfo_v2_t profileInfo =
++ *         { .version = nvmlComputeInstanceProfileInfo_v2 };
++ *     nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance,
++ *                                                                         profile,
++ *                                                                         engProfile,
++ *                                                                         &profileInfo);
++ * \endcode
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profile                              One of the NVML_COMPUTE_INSTANCE_PROFILE_*
++ * @param engProfile                           One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_*
++ * @param info                                 Returns detailed profile information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a profile isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile,
++                                                                   unsigned int engProfile,
++                                                                   nvmlComputeInstanceProfileInfo_v2_t *info);
++
++/**
++ * Get compute instance profile capacity.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profileId                            The compute instance profile ID.
++ *                                             See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param count                                Returns remaining instance count for the profile ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a profileId or \a count are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance,
++                                                                        unsigned int profileId, unsigned int *count);
++
++/**
++ * Get compute instance placements.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * A placement represents the location of a compute instance within a GPU instance. This API only returns all the possible
++ * placements for the given profile.
++ * A created compute instance occupies compute slices described by its placement. Creation of new compute instance will
++ * fail if there is overlap with the already occupied compute slices.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profileId                            The compute instance profile ID. See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param placements                           Returns placements allowed for the profile. Can be NULL to discover number
++ *                                             of allowed placements for this profile. If non-NULL must be large enough
++ *                                             to accommodate the placements supported by the profile.
++ * @param count                                Returns number of allowed placements for the profile.
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a profileId or \a count are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled or \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance,
++                                                                         unsigned int profileId,
++                                                                         nvmlComputeInstancePlacement_t *placements,
++                                                                         unsigned int *count);
++
++/**
++ * Create compute instance.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed
++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire
++ * a valid handle.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profileId                            The compute instance profile ID.
++ *                                             See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param computeInstance                      Returns the compute instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                       Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED           If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT        If \a gpuInstance, \a profileId or \a computeInstance
++ *                                                   are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED           If \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION           If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_INSUFFICIENT_RESOURCES  If the requested compute instance could not be created
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId,
++                                                          nvmlComputeInstance_t *computeInstance);
++
++/**
++ * Create compute instance with the specified placement.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed
++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire
++ * a valid handle.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profileId                            The compute instance profile ID.
++ *                                             See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param placement                            The requested placement. See \ref nvmlGpuInstanceGetComputeInstancePossiblePlacements
++ * @param computeInstance                      Returns the compute instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                       Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED           If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT        If \a gpuInstance, \a profileId or \a computeInstance
++ *                                                   are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED           If \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION           If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_INSUFFICIENT_RESOURCES  If the requested compute instance could not be created
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstanceWithPlacement(nvmlGpuInstance_t gpuInstance, unsigned int profileId,
++                                                                       const nvmlComputeInstancePlacement_t *placement,
++                                                                       nvmlComputeInstance_t *computeInstance);
++
++/**
++ * Destroy compute instance.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param computeInstance                      The compute instance handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a computeInstance is invalid
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_IN_USE            If the compute instance is in use. This error would be returned if
++ *                                             processes (e.g. CUDA application) are active on the compute instance.
++ */
++nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance);
++
++/**
++ * Get compute instances for given profile ID.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param profileId                            The compute instance profile ID.
++ *                                             See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param computeInstances                     Returns pre-existing compute instances, the buffer must be large enough to
++ *                                             accommodate the instances supported by the profile.
++ *                                             See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
++ * @param count                                The count of returned compute instances
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a profileId, \a computeInstances or \a count
++ *                                             are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a profileId isn't supported
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId,
++                                                        nvmlComputeInstance_t *computeInstances, unsigned int *count);
++
++/**
++ * Get compute instance for given instance ID.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ * Requires privileged user.
++ *
++ * @param gpuInstance                          The identifier of the target GPU instance
++ * @param id                                   The compute instance ID
++ * @param computeInstance                      Returns compute instance
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a gpuInstance, \a id or \a computeInstance are invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     If \a device doesn't have MIG mode enabled
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ *         - \ref NVML_ERROR_NOT_FOUND         If the compute instance is not found.
++ */
++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id,
++                                                           nvmlComputeInstance_t *computeInstance);
++
++/**
++ * Get compute instance information.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param computeInstance                      The compute instance handle
++ * @param info                                 Return compute instance information
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 Upon success
++ *         - \ref NVML_ERROR_UNINITIALIZED     If library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  If \a computeInstance or \a info are invalid
++ *         - \ref NVML_ERROR_NO_PERMISSION     If user doesn't have permission to perform the operation
++ */
++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info);
++
++/**
++ * Test if the given handle refers to a MIG device.
++ *
++ * A MIG device handle is an NVML abstraction which maps to a MIG compute instance.
++ * These overloaded references can be used (with some restrictions) interchangeably
++ * with a GPU device handle to execute queries at a per-compute instance granularity.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               NVML handle to test
++ * @param isMigDevice                          True when handle refers to a MIG device
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a device status was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device handle or \a isMigDevice reference is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this check is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice);
++
++/**
++ * Get GPU instance ID for the given MIG device handle.
++ *
++ * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               Target MIG device handle
++ * @param id                                   GPU instance ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if instance ID was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a id reference is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id);
++
++/**
++ * Get compute instance ID for the given MIG device handle.
++ *
++ * Compute instance IDs are unique per GPU instance and remain valid until the compute instance
++ * is destroyed.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               Target MIG device handle
++ * @param id                                   Compute instance ID
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if instance ID was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a id reference is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id);
++
++/**
++ * Get the maximum number of MIG devices that can exist under a given parent NVML device.
++ *
++ * Returns zero if MIG is not supported or enabled.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               Target device handle
++ * @param count                                Count of MIG devices
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a count was successfully retrieved
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a count reference is invalid
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count);
++
++/**
++ * Get MIG device handle for the given index under its parent NVML device.
++ *
++ * If the compute instance is destroyed either explicitly or by destroying,
++ * resetting or unbinding the parent GPU instance or the GPU device itself
++ * the MIG device handle would remain invalid and must be requested again
++ * using this API. Handles may be reused and their properties can change in
++ * the process.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param device                               Reference to the parent GPU device handle
++ * @param index                                Index of the MIG device
++ * @param migDevice                            Reference to the MIG device handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a migDevice handle was successfully created
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a index or \a migDevice reference is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_NOT_FOUND         if no valid MIG device was found at \a index
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index,
++                                                         nvmlDevice_t *migDevice);
++
++/**
++ * Get parent device handle from a MIG device handle.
++ *
++ * For Ampere &tm; or newer fully supported devices.
++ * Supported on Linux only.
++ *
++ * @param migDevice                            MIG device handle
++ * @param device                               Device handle
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if \a device handle was successfully created
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a migDevice or \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device);
++
++/** @} */ // @defgroup nvmlMultiInstanceGPU
++
++
++/***************************************************************************************************/
++/** @defgroup GPM NVML GPM
++ *  @{
++ */
++/***************************************************************************************************/
++/** @defgroup nvmlGpmEnums GPM Enums
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * GPM Metric Identifiers
++ */
++typedef enum
++{
++    NVML_GPM_METRIC_GRAPHICS_UTIL           = 1,    //!< Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0
++    NVML_GPM_METRIC_SM_UTIL                 = 2,    //!< Percentage of SMs that were busy. 0.0 - 100.0
++    NVML_GPM_METRIC_SM_OCCUPANCY            = 3,    //!< Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0
++    NVML_GPM_METRIC_INTEGER_UTIL            = 4,    //!< Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0
++    NVML_GPM_METRIC_ANY_TENSOR_UTIL         = 5,    //!< Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0
++    NVML_GPM_METRIC_DFMA_TENSOR_UTIL        = 6,    //!< Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0
++    NVML_GPM_METRIC_HMMA_TENSOR_UTIL        = 7,    //!< Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0
++    NVML_GPM_METRIC_IMMA_TENSOR_UTIL        = 9,    //!< Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0
++    NVML_GPM_METRIC_DRAM_BW_UTIL            = 10,   //!< Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0
++    NVML_GPM_METRIC_FP64_UTIL               = 11,   //!< Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0
++    NVML_GPM_METRIC_FP32_UTIL               = 12,   //!< Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0
++    NVML_GPM_METRIC_FP16_UTIL               = 13,   //!< Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0
++    NVML_GPM_METRIC_PCIE_TX_PER_SEC         = 20,   //!< PCIe traffic from this GPU in MiB/sec
++    NVML_GPM_METRIC_PCIE_RX_PER_SEC         = 21,   //!< PCIe traffic to this GPU in MiB/sec
++    NVML_GPM_METRIC_NVDEC_0_UTIL            = 30,   //!< Percent utilization of NVDEC 0. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_1_UTIL            = 31,   //!< Percent utilization of NVDEC 1. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_2_UTIL            = 32,   //!< Percent utilization of NVDEC 2. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_3_UTIL            = 33,   //!< Percent utilization of NVDEC 3. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_4_UTIL            = 34,   //!< Percent utilization of NVDEC 4. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_5_UTIL            = 35,   //!< Percent utilization of NVDEC 5. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_6_UTIL            = 36,   //!< Percent utilization of NVDEC 6. 0.0 - 100.0
++    NVML_GPM_METRIC_NVDEC_7_UTIL            = 37,   //!< Percent utilization of NVDEC 7. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_0_UTIL            = 40,   //!< Percent utilization of NVJPG 0. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_1_UTIL            = 41,   //!< Percent utilization of NVJPG 1. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_2_UTIL            = 42,   //!< Percent utilization of NVJPG 2. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_3_UTIL            = 43,   //!< Percent utilization of NVJPG 3. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_4_UTIL            = 44,   //!< Percent utilization of NVJPG 4. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_5_UTIL            = 45,   //!< Percent utilization of NVJPG 5. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_6_UTIL            = 46,   //!< Percent utilization of NVJPG 6. 0.0 - 100.0
++    NVML_GPM_METRIC_NVJPG_7_UTIL            = 47,   //!< Percent utilization of NVJPG 7. 0.0 - 100.0
++    NVML_GPM_METRIC_NVOFA_0_UTIL            = 50,   //!< Percent utilization of NVOFA 0. 0.0 - 100.0
++    NVML_GPM_METRIC_NVOFA_1_UTIL            = 51,   //!< Percent utilization of NVOFA 1. 0.0 - 100.0
++    NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60,   //!< NvLink read bandwidth for all links in MiB/sec
++    NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61,   //!< NvLink write bandwidth for all links in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC    = 62,   //!< NvLink read bandwidth for link 0 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC    = 63,   //!< NvLink write bandwidth for link 0 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC    = 64,   //!< NvLink read bandwidth for link 1 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC    = 65,   //!< NvLink write bandwidth for link 1 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC    = 66,   //!< NvLink read bandwidth for link 2 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC    = 67,   //!< NvLink write bandwidth for link 2 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC    = 68,   //!< NvLink read bandwidth for link 3 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC    = 69,   //!< NvLink write bandwidth for link 3 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC    = 70,   //!< NvLink read bandwidth for link 4 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC    = 71,   //!< NvLink write bandwidth for link 4 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC    = 72,   //!< NvLink read bandwidth for link 5 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC    = 73,   //!< NvLink write bandwidth for link 5 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC    = 74,   //!< NvLink read bandwidth for link 6 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC    = 75,   //!< NvLink write bandwidth for link 6 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC    = 76,   //!< NvLink read bandwidth for link 7 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC    = 77,   //!< NvLink write bandwidth for link 7 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC    = 78,   //!< NvLink read bandwidth for link 8 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC    = 79,   //!< NvLink write bandwidth for link 8 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC    = 80,   //!< NvLink read bandwidth for link 9 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC    = 81,   //!< NvLink write bandwidth for link 9 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC   = 82,   //!< NvLink read bandwidth for link 10 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC   = 83,   //!< NvLink write bandwidth for link 10 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC   = 84,   //!< NvLink read bandwidth for link 11 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC   = 85,   //!< NvLink write bandwidth for link 11 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC   = 86,   //!< NvLink read bandwidth for link 12 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC   = 87,   //!< NvLink write bandwidth for link 12 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC   = 88,   //!< NvLink read bandwidth for link 13 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC   = 89,   //!< NvLink write bandwidth for link 13 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC   = 90,   //!< NvLink read bandwidth for link 14 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC   = 91,   //!< NvLink write bandwidth for link 14 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC   = 92,   //!< NvLink read bandwidth for link 15 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC   = 93,   //!< NvLink write bandwidth for link 15 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC   = 94,   //!< NvLink read bandwidth for link 16 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC   = 95,   //!< NvLink write bandwidth for link 16 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC   = 96,   //!< NvLink read bandwidth for link 17 in MiB/sec
++    NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC   = 97,   //!< NvLink write bandwidth for link 17 in MiB/sec
++    //Put new metrics for BLACKWELL here...
++    NVML_GPM_METRIC_MAX                     = 98,   //!< Maximum value above +1. Note that changing this should also change NVML_GPM_METRICS_GET_VERSION due to struct size change
++} nvmlGpmMetricId_t;
++
++/** @} */ // @defgroup nvmlGpmEnums
++
++
++/***************************************************************************************************/
++/** @defgroup nvmlGpmStructs GPM Structs
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Handle to a GPM sample allocated with nvmlGpmSampleAlloc(). Free this with nvmlGpmSampleFree().
++ */
++typedef struct nvmlGpmSample_st* nvmlGpmSample_t;
++
++/**
++ * GPM metric information.
++ */
++typedef struct
++{
++    unsigned int metricId;   //!<  IN: NVML_GPM_METRIC_? define of which metric to retrieve
++    nvmlReturn_t nvmlReturn; //!<  OUT: Status of this metric. If this is nonzero, then value is not valid
++    double value;            //!<  OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS)
++    struct
++    {
++        char *shortName;
++        char *longName;
++        char *unit;
++    } metricInfo;            //!< OUT: Metric name and unit. Those can be NULL if not defined
++} nvmlGpmMetric_t;
++
++/**
++ * GPM buffer information.
++ */
++typedef struct
++{
++    unsigned int version;                              //!< IN: Set to NVML_GPM_METRICS_GET_VERSION
++    unsigned int numMetrics;                           //!< IN: How many metrics to retrieve in metrics[]
++    nvmlGpmSample_t sample1;                           //!< IN: Sample buffer
++    nvmlGpmSample_t sample2;                           //!< IN: Sample buffer
++    nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX];      //!< IN/OUT: Array of metrics. Set metricId on call. See nvmlReturn and value on return
++} nvmlGpmMetricsGet_t;
++
++#define NVML_GPM_METRICS_GET_VERSION 1
++
++/**
++ * GPM device information.
++ */
++typedef struct
++{
++    unsigned int version;           //!< IN: Set to NVML_GPM_SUPPORT_VERSION
++    unsigned int isSupportedDevice; //!< OUT: Indicates device support
++} nvmlGpmSupport_t;
++
++#define NVML_GPM_SUPPORT_VERSION 1
++
++/** @} */ // @defgroup nvmlGpmStructs
++
++/***************************************************************************************************/
++/** @defgroup nvmlGpmFunctions GPM Functions
++ *  @{
++ */
++/***************************************************************************************************/
++
++/**
++ * Calculate GPM metrics from two samples.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param metricsGet             IN/OUT: populated \a nvmlGpmMetricsGet_t struct
++ *
++ * @return
++ *         - \ref NVML_SUCCESS on success
++ *         - Nonzero NVML_ERROR_? enum on error
++ */
++nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet);
++
++
++/**
++ * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc()
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param gpmSample              Sample to free
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                on success
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
++ */
++nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample);
++
++
++/**
++ * Allocate a sample buffer to be used with NVML GPM. You will need to allocate
++ * at least two of these buffers to use with the NVML GPM feature.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param gpmSample             Where the allocated sample will be stored
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                on success
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
++ *         - \ref NVML_ERROR_MEMORY           if system memory is insufficient
++ */
++nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample);
++
++/**
++ * Read a sample of GPM metrics into the provided \a gpmSample buffer. After
++ * two samples are gathered, you can call nvmlGpmMetricsGet on those samples to
++ * retrieve metrics.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param device                Device to get samples for
++ * @param gpmSample             Buffer to read samples into
++ *
++ * @return
++ *         - \ref NVML_SUCCESS on success
++ *         - Nonzero NVML_ERROR_? enum on error
++ */
++nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample);
++
++/**
++ * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance.
++ *
++ * After two samples are gathered, you can call nvmlGpmMetricsGet on those
++ * samples to retrieve metrics.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ *
++ * @param device                Device to get samples for
++ * @param gpuInstanceId         MIG GPU Instance ID
++ * @param gpmSample             Buffer to read samples into
++ *
++ * @return
++ *         - \ref NVML_SUCCESS on success
++ *         - Nonzero NVML_ERROR_? enum on error
++ */
++nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample);
++
++/**
++ * Indicate whether the supplied device supports GPM
++ *
++ * @param device                NVML device to query for
++ * @param gpmSupport            Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates
++ *                              GPM support per system for the supplied device
++ *
++ * @return
++ *         - NVML_SUCCESS on success
++ *         - Nonzero NVML_ERROR_? enum if there is an error in processing the query
++ */
++nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport);
++
++/* GPM Stream State */
++/**
++ * Get GPM stream state.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               The identifier of the target device
++ * @param state                                Returns GPM stream state
++ *                                             NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the current GPM stream state was successfully queried
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a state is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlGpmQueryIfStreamingEnabled(nvmlDevice_t device, unsigned int *state);
++
++/**
++ * Set GPM stream state.
++ *
++ * For Hopper &tm; or newer fully supported devices.
++ * Supported on Linux, Windows TCC.
++ *
++ * @param device                               The identifier of the target device
++ * @param state                                GPM stream state,
++ *                                             NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                 if the GPM stream state was successfully set
++ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
++ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this operation is not supported by the device
++ */
++nvmlReturn_t DECLDIR nvmlGpmSetStreamingEnabled(nvmlDevice_t device, unsigned int state);
++
++/** @} */ // @defgroup nvmlGpmFunctions
++/** @} */ // @defgroup GPM
++
++#define NVML_DEV_CAP_EGM (1 << 0) // Extended GPU memory
++/**
++ * Device capabilities
++ */
++typedef struct
++{
++    unsigned int version;               //!< the API version number
++    unsigned int capMask;               //!< OUT: Bit mask of capabilities.
++} nvmlDeviceCapabilities_v1_t;
++typedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t;
++#define nvmlDeviceCapabilities_v1 NVML_STRUCT_VERSION(DeviceCapabilities, 1)
++
++/**
++ * Get device capabilities
++ *
++ * See \ref nvmlDeviceCapabilities_v1_t for more information on the struct.
++ *
++ * @param device                               The identifier of the target device
++ * @param caps                                 Returns GPU's capabilities
++ *
++ * @return
++ *         - \ref NVML_SUCCESS                         if the query succeeded
++ *         - \ref NVML_ERROR_UNINITIALIZED             if the library has not been successfully initialized
++ *         - \ref NVML_ERROR_INVALID_ARGUMENT          if \a device is invalid or \a caps is NULL
++ *         - \ref NVML_ERROR_NOT_SUPPORTED             if the device does not support this feature
++ *         - \ref NVML_ERROR_GPU_IS_LOST               if the target GPU has fallen off the bus or is otherwise inaccessible
++ *         - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported
++ *         - \ref NVML_ERROR_UNKNOWN                   on any unexpected error
++ */
++nvmlReturn_t DECLDIR nvmlDeviceGetCapabilities(nvmlDevice_t device,
++                                               nvmlDeviceCapabilities_t *caps);
++
++/**
++ * NVML API versioning support
++ */
++
++#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS
++nvmlReturn_t DECLDIR nvmlInit(void);
++nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount);
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device);
++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci);
++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo);
++nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes);
++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info);
++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos);
++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count);
++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo);
++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
++#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS
++
++#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS)
++// We don't define APIs to run new versions if this guard is present so there is
++// no need to undef
++#elif defined(__NVML_API_VERSION_INTERNAL)
++#undef nvmlDeviceGetGraphicsRunningProcesses
++#undef nvmlDeviceGetComputeRunningProcesses
++#undef nvmlDeviceGetMPSComputeRunningProcesses
++#undef nvmlDeviceGetAttributes
++#undef nvmlComputeInstanceGetInfo
++#undef nvmlEventSetWait
++#undef nvmlDeviceGetGridLicensableFeatures
++#undef nvmlDeviceRemoveGpu
++#undef nvmlDeviceGetNvLinkRemotePciInfo
++#undef nvmlDeviceGetPciInfo
++#undef nvmlDeviceGetCount
++#undef nvmlDeviceGetHandleByIndex
++#undef nvmlDeviceGetHandleByPciBusId
++#undef nvmlInit
++#undef nvmlBlacklistDeviceInfo_t
++#undef nvmlGetBlacklistDeviceCount
++#undef nvmlGetBlacklistDeviceInfoByIndex
++#undef nvmlDeviceGetGpuInstancePossiblePlacements
++#undef nvmlVgpuInstanceGetLicenseInfo
++#undef nvmlDeviceGetDriverModel
++#undef nvmlDeviceSetPowerManagementLimit
++
++#endif
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
+diff --git a/contrib/nvml.py b/contrib/nvml.py
+index 9f2c57d..2516979 100644
+--- a/contrib/nvml.py
++++ b/contrib/nvml.py
+@@ -1,6 +1,7 @@
+ import re
++import os
+ 
+-PATH="/usr/local/cuda/include/nvml.h"
++PATH=["./contrib/nvml.h", "/usr/local/cuda/include/nvml.h"]
+ func = ["nvmlInit",
+         "nvmlDeviceGetSupportedEventTypes",
+         "nvmlDeviceRegisterEvents",
+@@ -22,7 +23,13 @@ type_pattern = re.compile(
+         flags=re.MULTILINE
+ )
+ 
+-with open(PATH, 'r') as file:
++path=""
++if os.path.exists(PATH[0]) and os.access(PATH[0], os.R_OK):
++        path = PATH[0]
++else:
++        path = PATH[1]
++
++with open(path, 'r') as file:
+         content = file.read()
+         matched_lines = pattern.findall(content)
+         type_lines = type_pattern.findall(content)
+@@ -55,7 +62,7 @@ print('''
+ )
+ print('#include <dlfcn.h>\
+         \n#include <stdio.h>\
+-        \n#include "/usr/local/cuda/include/nvml.h"')
++        \n#include "{}"'.format(path))
+ print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);')
+ print('\n'.join(func_declares))
+ print('\nmy_nvmlErrorString_p my_nvmlErrorString;')
+-- 
+2.43.5
+
diff --git a/1024-anolis-do-not-print-teq-error.patch b/1024-anolis-do-not-print-teq-error.patch
new file mode 100644
index 0000000000000000000000000000000000000000..4fbe782120b8c768ea8e0a55945f60751fd50adc
--- /dev/null
+++ b/1024-anolis-do-not-print-teq-error.patch
@@ -0,0 +1,50 @@
+From c6a9ca106c41e1f351849bce5d491bba3813cc10 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Thu, 17 Apr 2025 17:26:48 +0800
+Subject: [PATCH 24/30] anolis: do not print teq error
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ ras-cxl-handler.c | 2 +-
+ ras-mce-handler.c | 6 +++---
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
+index 575fff8..55509f1 100644
+--- a/ras-cxl-handler.c
++++ b/ras-cxl-handler.c
+@@ -718,7 +718,7 @@ static int handle_ras_cxl_common_hdr(struct trace_seq *s,
+ 	if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0)
+ 		return -1;
+ 
+-	if (tep_get_field_val(s,  event, "hdr_maint_op_sub_class", record, &val, 1) < 0)
++	if (tep_get_field_val(s,  event, "hdr_maint_op_sub_class", record, &val, 0) < 0)
+ 		return -1;
+ 	hdr->hdr_maint_op_sub_class = val;
+ 	if (trace_seq_printf(s, "hdr_maint_op_sub_class:%u ", hdr->hdr_maint_op_sub_class) <= 0)
+diff --git a/ras-mce-handler.c b/ras-mce-handler.c
+index fc2e8d4..0f0d37f 100644
+--- a/ras-mce-handler.c
++++ b/ras-mce-handler.c
+@@ -571,15 +571,15 @@ int ras_mce_event_handler(struct trace_seq *s,
+ 	e.ipid = val;
+ 
+ 	/* Get PPIN */
+-	if (!tep_get_field_val(s, event, "ppin", record, &val, 1))
++	if (!tep_get_field_val(s, event, "ppin", record, &val, 0))
+ 		e.ppin = val;
+ 
+ 	/* Get Microcode Revision */
+-	if (!tep_get_field_val(s, event, "microcode", record, &val, 1))
++	if (!tep_get_field_val(s, event, "microcode", record, &val, 0))
+ 		e.microcode = val;
+ 
+ 	/* Get Vendor-specfic Data, if any */
+-	e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1);
++	e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 0);
+ 
+ 	switch (mce->cputype) {
+ 	case CPU_GENERIC:
+-- 
+2.43.5
+
diff --git a/1025-anolis-add-init.sh-for-different-user.patch b/1025-anolis-add-init.sh-for-different-user.patch
new file mode 100644
index 0000000000000000000000000000000000000000..90dd4af91b6a349ef2135d5b1b12e3753b974ef5
--- /dev/null
+++ b/1025-anolis-add-init.sh-for-different-user.patch
@@ -0,0 +1,104 @@
+From bec7414b742dc7164d7674a0eb9489c4723514ab Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 18 Apr 2025 15:43:57 +0800
+Subject: [PATCH 25/30] anolis: add init.sh for different user
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am            |  1 +
+ contrib/rasdaemon.init | 26 ++++++++++++++++++++++++++
+ misc/rasdaemon.spec.in | 18 ++++++++++++------
+ 3 files changed, 39 insertions(+), 6 deletions(-)
+ create mode 100644 contrib/rasdaemon.init
+
+diff --git a/Makefile.am b/Makefile.am
+index 4aba962..203b576 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -24,6 +24,7 @@ EXTRA_DIST = \
+ 	$(RSYSLOG_EXT_SERVICES_IN) \
+ 	misc/rasdaemon.env \
+ 	misc/notices \
++	contrib/rasdaemon.init \
+ 	contrib/nvml.py \
+ 	contrib/nvml.h \
+ 	contrib/*_trigger
+diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init
+new file mode 100644
+index 0000000..d575af9
+--- /dev/null
++++ b/contrib/rasdaemon.init
+@@ -0,0 +1,26 @@
++#!/bin/sh
++target=$1
++ENV_PATH="/etc/sysconfig/rasdaemon"
++
++case "$target" in
++	ecs)
++		sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH}
++		;;
++	ebs)
++		sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH}
++		sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH}
++		sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH}
++		sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH}
++		sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH}
++		;;
++	jituan)
++		sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH}
++		sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH}
++		exit 1
++		;;
++        zhuanyou)
++		sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH}
++		sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH}
++                ;;
++
++esac
+\ No newline at end of file
+diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
+index 23be188..bf4cc4b 100644
+--- a/misc/rasdaemon.spec.in
++++ b/misc/rasdaemon.spec.in
+@@ -61,6 +61,7 @@ install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{
+ install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext
+ install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/
+ install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/
++install -D -p -m 0755 contrib/%{name}.init %{buildroot}/usr/share/%{name}/%{name}.init
+ rm INSTALL %{buildroot}/usr/include/*.h
+ 
+ %files
+@@ -71,12 +72,13 @@ rm INSTALL %{buildroot}/usr/include/*.h
+ %{_unitdir}/*.service
+ %{_sysconfdir}/ras/dimm_labels.d
+ %{_sysconfdir}/ras/*/*
+-%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
+-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng
+-%config(noreplace) /usr/share/%{name}/%{name}.logrotate
+-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog
+-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext
+-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext
++%{_sysconfdir}/sysconfig/%{name}
++/usr/share/%{name}/%{name}.syslog-ng
++/usr/share/%{name}/%{name}.logrotate
++/usr/share/%{name}/%{name}.rsyslog
++/usr/share/%{name}/%{name}.syslog-ng-ext
++/usr/share/%{name}/%{name}.rsyslog-ext
++/usr/share/%{name}/%{name}.init
+ %{_sysconfdir}/rasdaemon_notices/*
+ 
+ %post
+@@ -104,6 +106,10 @@ if ! systemctl is-enabled --quiet %{name}.service; then
+     echo "Rasdaemon service is not enabled, enable it";
+     systemctl enable %{name}.service;
+ fi
++echo "Rasdaemon install for ${RASDAEMON_TARGET}";
++/usr/share/%{name}/%{name}.init ${RASDAEMON_TARGET}
++
++systemctl daemon-reload
+ systemctl restart %{name}.service
+ 
+ %preun
+-- 
+2.43.5
+
diff --git a/1026-anolis-fix-systemd-config.patch b/1026-anolis-fix-systemd-config.patch
new file mode 100644
index 0000000000000000000000000000000000000000..45770982f50740631b332e895af7bd2785fe6587
--- /dev/null
+++ b/1026-anolis-fix-systemd-config.patch
@@ -0,0 +1,30 @@
+From 09d282c32c52224af0b7310b24e6ddf4cd4efb61 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 18 Apr 2025 16:47:46 +0800
+Subject: [PATCH 26/30] anolis: fix systemd config
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ misc/rasdaemon.service.in | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in
+index 0bb643f..c72b2d7 100644
+--- a/misc/rasdaemon.service.in
++++ b/misc/rasdaemon.service.in
+@@ -7,10 +7,10 @@ Description=RAS daemon to log the RAS events
+ 
+ [Service]
+ EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon
+-ExecStart=@sbindir@/rasdaemon -f -r
++ExecStart=@sbindir@/rasdaemon -f
+ ExecStartPost=@sbindir@/rasdaemon --enable
+ ExecStop=@sbindir@/rasdaemon --disable
+-Restart=on-abort
++Restart=always
+ 
+ [Install]
+ WantedBy=multi-user.target
+-- 
+2.43.5
+
diff --git a/1027-anolis-add-nvgpu-driver.patch b/1027-anolis-add-nvgpu-driver.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ca4bc5c0dfa43357b0e20589b39f8630e69378bd
--- /dev/null
+++ b/1027-anolis-add-nvgpu-driver.patch
@@ -0,0 +1,590 @@
+From ed059449efe2ce84e1c7cffdc5502430052c043e Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Wed, 23 Apr 2025 11:17:32 +0800
+Subject: [PATCH 1/3] anolis: add nvgpu driver
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ Makefile.am        |  22 ++-
+ configure.ac       |   5 +
+ ras-nvgpu-driver.c | 444 +++++++++++++++++++++++++++++++++++++++++++++
+ ras-nvgpu-nvml.c   |   2 -
+ ras-nvgpu.c        |  10 +-
+ ras-nvgpu.h        |   2 +
+ 6 files changed, 476 insertions(+), 9 deletions(-)
+ create mode 100644 ras-nvgpu-driver.c
+
+diff --git a/Makefile.am b/Makefile.am
+index 203b576..c400473 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -27,7 +27,9 @@ EXTRA_DIST = \
+ 	contrib/rasdaemon.init \
+ 	contrib/nvml.py \
+ 	contrib/nvml.h \
+-	contrib/*_trigger
++	contrib/*_trigger \
++	libnvgpudriver_x86_64.a \
++	libnvgpudriver_aarch64.a
+ 
+ CLEANFILES= \
+ 	ras-nvgpu-nvml.h \
+@@ -148,14 +150,16 @@ if WITH_ERST
+ endif
+ 
+ if WITH_NVGPU
+-   BUILT_SOURCES = ras-nvgpu-nvml.h
++   BUILT_SOURCES = ras-nvgpu-nvml.h libnvgpudriver.a
+ ras-nvgpu-nvml.h: contrib/nvml.py
+ 	python3 $< > $@
++libnvgpudriver.a: nvgpu_driver
++	cp libnvgpudriver_$(shell uname -m).a $@
+    rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c
+ endif
+ 
+-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS)
+-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS)
++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) $(NVGPU_LIBS)
++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) $(NVGPU_CFLAGS)
+ 
+ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
+ 		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
+@@ -210,3 +214,13 @@ install-data-local:
+ 		install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.rsyslog-ext"; \
+ 	fi
+ 	$(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/"
++
++nvgpu_driver:
++	if [ ! -d "open-gpu-kernel-modules" ]; then git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git -b 570; fi
++	$(CC) -o ras-nvgpu-driver.o -I./open-gpu-kernel-modules/kernel-open/common/inc \
++		-I./open-gpu-kernel-modules/kernel-open/nvidia-uvm \
++		-I./open-gpu-kernel-modules/src/common/sdk/nvidia/inc \
++		-I./open-gpu-kernel-modules/src/nvidia/arch/nvalloc/unix/include \
++		$(LIBTRACEEVENT_CFLAGS) \
++		-O2 -fPIE -c ras-nvgpu-driver.c
++	$(AR) rcs libnvgpudriver_$(shell uname -m).a ras-nvgpu-driver.o
+diff --git a/configure.ac b/configure.ac
+index 68fcb75..46ba36e 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -303,10 +303,15 @@ AC_ARG_ENABLE([nvgpu],
+ AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [
+   AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect")
+   AC_SUBST([WITH_NVGPU])
++  NVGPU_LIBS="-lnvgpudriver"
++  NVGPU_CFLAGS="-L."
+ ])
+ AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes])
+ AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"])
+ 
++AC_SUBST([NVGPU_LIBS])
++AC_SUBST([NVGPU_CFLAGS])
++
+ AC_ARG_ENABLE([kmsg_monitor],
+     AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)]))
+ 
+diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c
+new file mode 100644
+index 0000000..a72a7c5
+--- /dev/null
++++ b/ras-nvgpu-driver.c
+@@ -0,0 +1,444 @@
++
++#include "nvtypes.h"
++#include <class/cl0005.h>
++#include <class/cl0080.h>  // NV01_DEVICE_0
++#include <class/cl2080.h>  // NV20_SUBDEVICE_0
++
++#include <ctrl/ctrl0000/ctrl0000gpu.h>
++#include <ctrl/ctrlc36f.h> // VOLTA_CHANNELChannelGPFifoA
++#include <ctrl/ctrl503c.h>  // NV20_SUBDEVICE_0
++#include <ctrl/ctrl2080/ctrl2080event.h>  // NV20_SUBDEVICE_0
++
++#include <stddef.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <assert.h>
++#include <string.h>
++#include <sys/syslog.h>
++#include <sys/types.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/ioctl.h>
++#include <poll.h>
++#include <time.h>
++
++#include "ras-logger.h"
++#include "ras-nvgpu.h"
++#include <traceevent/trace-seq.h>
++#define NV_PLATFORM_MAX_IOCTL_SIZE 16384
++#include "nv.h"
++#include "nvos.h"
++#include "nv_escape.h"
++
++#include "nvstatus.h"
++
++#define NV_PRINTF_STRING_SECTION
++#undef NV_STATUS_CODE
++#undef SDK_NVSTATUSCODES_H
++#define NV_STATUS_CODE( name, code, string ) static NV_PRINTF_STRING_SECTION   \
++    const char rm_pvt_##name##_str[] = string " [" #name "]";
++#include "nvstatuscodes.h"
++
++#undef NV_STATUS_CODE
++#undef SDK_NVSTATUSCODES_H
++#define NV_STATUS_CODE( name, code, string ) [code] = { name, rm_pvt_##name##_str },
++static struct NvStatusCodeString
++{
++    NV_STATUS   statusCode;
++    const char *statusString;
++} g_StatusCodeList[] = {
++   #include "nvstatuscodes.h"
++};
++#undef NV_STATUS_CODE
++
++#include <stdlib.h>
++
++#define assert_with_message(condition, message, ...) \
++	do { \
++		if (!(condition)) { \
++			log(ALL, LOG_ERR, "%s Assertion failed: %s: " message "\n", \
++				__func__, #condition, ##__VA_ARGS__); \
++			ret = 1; \
++		} \
++	} while (0)
++
++#define nv_assert_ioctl(fd, cmd, p) \
++	do { \
++		int r = ioctl(fd, __NV_IOWR(cmd, p), &p); \
++		assert_with_message(r == 0, "%s", strerror(r)); \
++		assert_with_message(p.status == 0, "%s", g_StatusCodeList[p.status].statusString); \
++	} while (0)
++
++#define error_exit(a, free) \
++	do { \
++		a; \
++		if (ret) goto free; \
++	} while (0)
++
++static int ret;
++static void alloc_root(int fd_ctl, NvHandle *root) {
++	NVOS64_PARAMETERS p = {
++		.hClass = NV01_ROOT_CLIENT
++	};
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p);
++	*root = p.hObjectNew;
++}
++
++static void free_nvgpu(int fd_ctl, NvHandle root, NvHandle obj, NvHandle old_obj) {
++	NVOS00_PARAMETERS p = {
++		.hRoot = root, .hObjectParent = obj, .hObjectOld = old_obj
++	};
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_FREE, p);
++}
++
++static void alloc_device(int fd_ctl, NvHandle root, NV0080_ALLOC_PARAMETERS *dev, NvHandle *device) {
++	NVOS64_PARAMETERS p = {
++		.hRoot = root, .hObjectParent = root, .hClass = NV01_DEVICE_0, .pAllocParms = dev, .paramsSize = sizeof(*dev)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p);
++	*device = p.hObjectNew;
++}
++
++static void alloc_subdevice(int fd_ctl, NvHandle root, NvHandle parent, NV2080_ALLOC_PARAMETERS *subdev, NvHandle *subdevice) {
++	NVOS64_PARAMETERS p = {
++		.hRoot = root, .hObjectParent = parent, .hClass = NV20_SUBDEVICE_0, .pAllocParms = subdev, .paramsSize = sizeof(*subdev)
++	};
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p);
++
++	*subdevice = p.hObjectNew;
++}
++
++static void wait_open(int fd_dev)
++{
++	nv_ioctl_wait_open_complete_t p = { 0 };
++	/* Named "r" so assert_with_message() sets the file-scope ret, not a local. */
++	int r = ioctl(fd_dev, __NV_IOWR(NV_ESC_WAIT_OPEN_COMPLETE, p), &p);
++	assert_with_message(r == 0, "%s", strerror(r));
++}
++
++static void get_pci(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS *pci) {
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PCI_INFO, .params = pci, .paramsSize = sizeof(*pci)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void attach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_ATTACH_IDS_PARAMS *attach) {
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_ATTACH_IDS, .params = attach, .paramsSize = sizeof(*attach)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void deattach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_DETACH_IDS_PARAMS *attach) {
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_DETACH_IDS, .params = attach, .paramsSize = sizeof(*attach)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void get_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS *probe) {
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PROBED_IDS, .params = probe, .paramsSize = sizeof(*probe)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void get_id_info(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_ID_INFO_PARAMS *info) {
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_ID_INFO, .params = info, .paramsSize = sizeof(*info)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void register_fd(int fd_dev, int fd_ctl) {
++	nv_ioctl_register_fd_t p = { .ctl_fd = fd_ctl };
++	int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_REGISTER_FD, p), &p);
++	assert(ret == 0);
++}
++
++static void alloc_event(int fd_dev, NvHandle root, NvHandle device, int fd_uvm) {
++	nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_uvm };
++	int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p);
++	assert(ret == 0);
++}
++
++static void free_event(int fd_dev, NvHandle root, NvHandle device) {
++	nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_dev };
++	int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p);
++	assert(ret == 0);
++}
++
++static void event_os_event(int fd_dev, NvHandle root, NvHandle subdevice, int index, NvHandle *event, int fd_uvm) {
++	NV0005_ALLOC_PARAMETERS pp = { .hParentClient = root, .data = (NvP64)fd_uvm, .notifyIndex = index, .hClass = NV01_EVENT_OS_EVENT };
++
++	NVOS64_PARAMETERS p = {
++		.hRoot = root, .hObjectParent = subdevice, .hClass = NV01_EVENT_OS_EVENT, .pAllocParms = &pp, .paramsSize = sizeof(pp)
++	};
++
++	nv_assert_ioctl(fd_dev, NV_ESC_RM_ALLOC, p);
++	*event = p.hObjectNew;
++}
++
++static void set_event(int fd_ctl, NvHandle root, NvHandle subdevice, int index, int type)
++{
++	NV2080_CTRL_EVENT_SET_NOTIFICATION_PARAMS set = { .event = index, .action = type, .bNotifyState = 0 };
++
++	NVOS54_PARAMETERS p = {
++		.hClient = root, .hObject = subdevice, .cmd = NV2080_CTRL_CMD_EVENT_SET_NOTIFICATION, .params = &set, .paramsSize = sizeof(set)
++	};
++
++	nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p);
++}
++
++static void get_event(NvUnixEvent *event, int fd_dev, int fd_uvm, NvHandle root, NvHandle subdevice, int i)
++{
++	NVOS41_PARAMETERS p = { .pEvent = event, .MoreEvents = 0 };
++	int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_RM_GET_EVENT_DATA, p), &p);
++	assert(ret == 0);
++}
++
++struct ras_nvgpu_event {
++	NvHandle event;
++	NvV32 index;
++};
++
++#define NVGPU_EVENT_NUM 10
++struct ras_nvgpu_driver {
++	NvHandle device;
++	NvHandle subdevice;
++	NvU32 gpu_id;
++	int fd;
++	NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci;
++	struct ras_nvgpu_event events[NVGPU_EVENT_NUM];
++};
++
++static int event_index[NVGPU_EVENT_NUM] = {
++	NV2080_NOTIFIERS_RC_ERROR,
++	NV2080_NOTIFIERS_ECC_DBE,
++	NV2080_NOTIFIERS_NVLINK_ERROR_FATAL,
++	NV2080_NOTIFIERS_NVLINK_ERROR_RECOVERY_REQUIRED,
++	NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL,
++	NV2080_NOTIFIERS_POISON_ERROR_FATAL,
++	NV2080_NOTIFIERS_NVLINK_INFO_LINK_DOWN,
++	NV2080_NOTIFIERS_ECC_SBE_STORM,
++	NV2080_NOTIFIERS_NVLINK_UNCONTAINED_ERROR,
++	NV2080_NOTIFIERS_GPU_UNAVAILABLE
++};
++
++static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *event)
++{
++	struct trace_seq s;
++	time_t now;
++	struct tm *tm;
++	char timestamp[64];
++
++	time(&now);
++	tm = localtime(&now);
++
++	if (tm)
++		strftime(timestamp, sizeof(timestamp),
++			 "%Y-%m-%d %H:%M:%S %z", tm);
++
++	trace_seq_init(&s);
++	if (event->NotifyIndex == NV2080_NOTIFIERS_RC_ERROR) {
++		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
++			"<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME);
++		trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp);
++		trace_seq_printf(&s, "xid: %d ", event->info32);
++		trace_seq_printf(&s, "data1: %d ", event->info16);
++	} else {
++		trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ",
++			"<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME);
++		trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp);
++		trace_seq_printf(&s, "event_type: %d ", event->NotifyIndex);
++		trace_seq_printf(&s, "data: %d ", event->info32);
++		trace_seq_printf(&s, "data1: %d ", event->info16);
++
++	}
++
++	trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot);
++
++	trace_seq_terminate(&s);
++	trace_seq_do_printf(&s);
++	printf("\n");
++	fflush(stdout);
++	trace_seq_destroy(&s);
++
++	return 0;
++}
++
++int ras_nvgpu_driver_handle(void) {
++	int fd_ctl = 0, fd_uvm = 0, i, gpu_count = 0;
++	NvHandle root = 0;
++	struct pollfd *pfd;
++
++	fd_uvm = open("/dev/nvidia-uvm", O_RDWR | O_CLOEXEC);
++	if (fd_uvm < 0) {
++		perror("open");
++		return 1;
++	}
++
++	fd_ctl = open("/dev/nvidiactl", O_RDWR | O_CLOEXEC);
++	if (fd_ctl < 0) {
++		perror("open");
++		ret = 1;
++		goto close_uvm;
++	}
++
++	error_exit(alloc_root(fd_ctl, &root), close);
++
++	NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS id = {0};
++	NV0000_CTRL_GPU_ATTACH_IDS_PARAMS attach = {0};
++	NV0000_CTRL_GPU_DETACH_IDS_PARAMS detach = {0};
++	error_exit(get_id(fd_ctl, root, &id), free_root);
++
++	for (i = 0; i < NV0000_CTRL_GPU_MAX_PROBED_GPUS; i++) {
++		if (id.gpuIds[i] == NV0000_CTRL_GPU_INVALID_ID)
++			break;
++
++		attach.gpuIds[i] = id.gpuIds[i];
++		detach.gpuIds[i] = id.gpuIds[i];
++	}
++	gpu_count = i;
++	attach.gpuIds[i] = NV0000_CTRL_GPU_INVALID_ID;
++
++	error_exit(attach_id(fd_ctl, root, &attach), free_root);
++
++	struct ras_nvgpu_driver *nvgpus = calloc(gpu_count, sizeof(struct ras_nvgpu_driver));
++	if (!nvgpus) {
++		log(ALL, LOG_ERR, "nvgpu alloc error\n");
++		ret = 1;
++		goto detach;
++	}
++
++	for (i = 0; i < gpu_count; i++) {
++		char path[32];
++		struct ras_nvgpu_driver *nvgpu = &nvgpus[i];
++		NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci = {0};
++		NV0000_CTRL_GPU_GET_ID_INFO_PARAMS info = {0};
++		NV0080_ALLOC_PARAMETERS dev = { 0 };
++		NV2080_ALLOC_PARAMETERS subdev = { 0 };
++		NvU32 gpu_id = id.gpuIds[i];
++		int fd;
++
++		nvgpu->gpu_id = gpu_id;
++		snprintf(path, 32, "/dev/nvidia%d", i);
++		nvgpu->fd = open(path, O_RDWR | O_CLOEXEC);
++		if (nvgpu->fd < 0) {
++			log(ALL, LOG_ERR, "nvgpu open error\n");
++			goto free_nvgpu;
++		}
++		fd = nvgpu->fd;
++
++		error_exit(wait_open(fd), free_nvgpu);
++
++		pci.gpuId = gpu_id;
++		error_exit(get_pci(fd_ctl, root, &pci), free_nvgpu);
++		nvgpu->pci = pci;
++
++		info.gpuId = id.gpuIds[i];
++		error_exit(get_id_info(fd_ctl, root, &info), free_nvgpu);
++
++		error_exit(register_fd(fd, fd_ctl), free_nvgpu);
++
++		dev.deviceId = info.deviceInstance;
++		error_exit(alloc_device(fd_ctl, root, &dev, &nvgpu->device), free_nvgpu);
++
++		subdev.subDeviceId = info.subDeviceInstance;
++		error_exit(alloc_subdevice(fd_ctl, root, nvgpu->device, &subdev, &nvgpu->subdevice), free_nvgpu);
++
++		error_exit(alloc_event(fd, root, nvgpu->device, fd_uvm), free_nvgpu);
++
++		for (int j = 0; j < NVGPU_EVENT_NUM; j++) {
++			struct ras_nvgpu_event *event = &nvgpu->events[j];
++			event->index = event_index[j];
++
++			event_os_event(fd, root, nvgpu->subdevice, event->index, &event->event, fd_uvm);
++			if (ret) {
++				log(ALL, LOG_ERR, "nvgpu event %d register error\n", event->index);
++				ret = 0;
++				continue;
++			}
++			set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT);
++			if (ret) {
++				log(ALL, LOG_ERR, "nvgpu event %d set error\n", event->index);
++				free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event);
++				ret = 0;
++				continue;
++			}
++		}
++		log(ALL, LOG_INFO, "GPU %d: %04x:%02x:%02x.0 found, deviceid %d subdeviceid %d\n",
++		    nvgpu->gpu_id, nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot, info.deviceInstance, info.subDeviceInstance);
++	}
++
++	pfd = malloc(sizeof(struct pollfd) * gpu_count);
++	if (!pfd) {
++		log(ALL, LOG_ERR, "nvgpu alloc error\n");
++		ret =  1;
++		goto free_nvgpu;
++	}
++
++	for (i = 0; i < gpu_count; i++) {
++		pfd[i].fd = nvgpus[i].fd;
++		pfd[i].events = POLLIN | POLLPRI;
++	}
++
++	while (1) {
++		if (poll(pfd, gpu_count, -1) < 0) {
++			log(ALL, LOG_ERR, "nvgpu poll error\n");
++			goto free_pfd;
++		}
++
++		for (i = 0; i < gpu_count; i++) {
++			if (pfd[i].revents & POLLIN) {
++				NvUnixEvent event;
++
++				get_event(&event, nvgpus[i].fd, fd_uvm, root, nvgpus[i].subdevice, 25);
++
++				report_ras_nvgpu_driver(&nvgpus[i], &event);
++			}
++		}
++	}
++
++free_pfd:
++	free(pfd);
++free_nvgpu:
++	for (i = 0; i < gpu_count; i++) {
++		struct ras_nvgpu_driver *nvgpu = &nvgpus[i];
++
++		for (int j = 0; j < NVGPU_EVENT_NUM; j++) {
++			struct ras_nvgpu_event *event = &nvgpu->events[j];
++
++			if (event->event) {
++				set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE);
++				free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event);
++			}
++		}
++		free_event(nvgpu->fd, root, nvgpu->device);
++		if (nvgpu->subdevice)
++			free_nvgpu(fd_ctl, root, nvgpu->device, nvgpu->subdevice);
++		if (nvgpu->device)
++			free_nvgpu(fd_ctl, root, nvgpu->device, 0);
++		if (nvgpu->device)
++			free_nvgpu(fd_ctl, root, root, nvgpu->device);
++		if (nvgpu->fd)
++			close(nvgpu->fd);
++	}
++detach:
++	deattach_id(fd_ctl, root, &detach);
++free_root:
++	free_nvgpu(fd_ctl, root, root, root);
++close:
++	close(fd_ctl);
++close_uvm:
++	close(fd_uvm);
++
++	return ret;
++}
+\ No newline at end of file
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c
+index 2758d14..541ff69 100644
+--- a/ras-nvgpu-nvml.c
++++ b/ras-nvgpu-nvml.c
+@@ -14,8 +14,6 @@
+ #include "trace-seq.h"
+ #include "types.h"
+ 
+-#define XID_EVENT_NAME "xid"
+-
+ const char *lib_name[] = {
+ 	"/lib64/libnvidia-ml.so",
+ 	"/lib64/libnvidia-ml.so.1",
+diff --git a/ras-nvgpu.c b/ras-nvgpu.c
+index 5c63279..4d39de2 100644
+--- a/ras-nvgpu.c
++++ b/ras-nvgpu.c
+@@ -43,12 +43,16 @@ void *ras_nvgpu_handle(void *arg)
+ 
+ 	while (retry--) {
+ 		if (ras_nvgpu_nvml_handle()) {
+-			log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry);
+-			sleep(10);
++			log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry);
++			sleep(1);
+ 		}
+ 	}
+ 
+-	log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n");
++	log(ALL, LOG_ERR, "NVGPU nvml handle fail, try for nvgpu driver call\n");
++
++	ras_nvgpu_driver_handle();
++
++	log(ALL, LOG_ERR, "NVGPU driver handle fail, exit nvgpu thread\n");
+ 
+ 	return NULL;
+ }
+diff --git a/ras-nvgpu.h b/ras-nvgpu.h
+index 32827ad..bade7e4 100644
+--- a/ras-nvgpu.h
++++ b/ras-nvgpu.h
+@@ -8,7 +8,9 @@
+ #define __RAS_NVGPU_H
+ 
+ #define NVGPU_EVENT_NAME "nvgpu"
++#define XID_EVENT_NAME "xid"
+ 
+ void *ras_nvgpu_handle(void *arg);
+ int ras_nvgpu_nvml_handle(void);
++int ras_nvgpu_driver_handle(void);
+ #endif
+-- 
+2.43.5
+
diff --git a/1028-anolis-add-trigger-for-nvgpu-event.patch b/1028-anolis-add-trigger-for-nvgpu-event.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8c17b64c66c7f3967258bd89889656f775b6a98f
--- /dev/null
+++ b/1028-anolis-add-trigger-for-nvgpu-event.patch
@@ -0,0 +1,241 @@
+From 67fcdb9008b17555b0ea0d4c791f3ac772ee682c Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 25 Apr 2025 10:20:16 +0800
+Subject: [PATCH 2/3] anolis: add trigger for nvgpu event
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ contrib/nvgpu_trigger | 25 +++++++++++++++++++++++++
+ misc/rasdaemon.env    |  3 +++
+ ras-nvgpu-driver.c    |  7 ++++++-
+ ras-nvgpu-nvml.c      |  8 +++++++-
+ ras-nvgpu.c           |  3 +++
+ trigger.c             | 35 +++++++++++++++++++++++++++++++++++
+ trigger.h             |  1 +
+ 7 files changed, 80 insertions(+), 2 deletions(-)
+ create mode 100755 contrib/nvgpu_trigger
+
+diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger
+new file mode 100755
+index 0000000..48955af
+--- /dev/null
++++ b/contrib/nvgpu_trigger
+@@ -0,0 +1,25 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when an
++#  NVIDIA GPU (nvgpu) event occurs; the environment variables listed below
++#  carry the information reported for that event.
++
++# environment:
++# BDF
++# EVENT_TYPE
++# DATA1
++# DATA2
++#
++
++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local
++
++if [ -d nvgpu_trigger.extern ]
++then
++    ls nvgpu_trigger.extern |
++    while read item
++    do
++        [ -x ./nvgpu_trigger.extern/$item ] && . ./nvgpu_trigger.extern/$item
++    done
++fi
++
++exit 0
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 198b050..b08afa6 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
+ KMSG_TRIGGER=
+ KMSG_TRIGGER_TIMEOUT=0
+ 
++NVGPU_TRIGGER=
++NVGPU_TRIGGER_TIMEOUT=0
++
+ # CE Statistic Threshold
+ #
+ # Specify the threshold of CE per second.
+diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c
+index a72a7c5..9093292 100644
+--- a/ras-nvgpu-driver.c
++++ b/ras-nvgpu-driver.c
+@@ -24,6 +24,7 @@
+ 
+ #include "ras-logger.h"
+ #include "ras-nvgpu.h"
++#include "trigger.h"
+ #include <traceevent/trace-seq.h>
+ #define NV_PLATFORM_MAX_IOCTL_SIZE 16384
+ #include "nv.h"
+@@ -238,6 +239,7 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *
+ 	time_t now;
+ 	struct tm *tm;
+ 	char timestamp[64];
++	char tmpbuf[64];
+ 
+ 	time(&now);
+ 	tm = localtime(&now);
+@@ -263,7 +265,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *
+ 
+ 	}
+ 
+-	trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot);
++	snprintf(tmpbuf, sizeof(tmpbuf), "%08X:%02X:%02X.0", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot);
++	trace_seq_printf(&s, "pci_port: %s ", tmpbuf);
+ 
+ 	trace_seq_terminate(&s);
+ 	trace_seq_do_printf(&s);
+@@ -271,6 +274,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *
+ 	fflush(stdout);
+ 	trace_seq_destroy(&s);
+ 
++	run_nvgpu_trigger(tmpbuf, event->NotifyIndex, event->info32, event->info16);
++
+ 	return 0;
+ }
+ 
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c
+index 541ff69..f2421a1 100644
+--- a/ras-nvgpu-nvml.c
++++ b/ras-nvgpu-nvml.c
+@@ -4,6 +4,7 @@
+  * Copyright (C) 2025 Alibaba Inc
+  */
+ 
++#include <stdio.h>
+ #include <time.h>
+ #include <unistd.h>
+ #include <stdlib.h>
+@@ -13,6 +14,7 @@
+ #include "ras-nvgpu.h"
+ #include "trace-seq.h"
+ #include "types.h"
++#include "trigger.h"
+ 
+ const char *lib_name[] = {
+ 	"/lib64/libnvidia-ml.so",
+@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 	time_t now;
+ 	struct tm *tm;
+ 	char timestamp[64];
++	char tmpbuf[64];
+ 
+ 	time(&now);
+ 	tm = localtime(&now);
+@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 		trace_seq_printf(&s, "data: %lld ", data->eventData);
+ 	}
+ 
+-	trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci));
++	snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci));
++	trace_seq_printf(&s, "pci_port: %s ", tmpbuf);
+ 	trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId);
+ 	trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId);
+ 
+@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 	fflush(stdout);
+ 	trace_seq_destroy(&s);
+ 
++	run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0);
++
+ 	return 0;
+ }
+ 
+diff --git a/ras-nvgpu.c b/ras-nvgpu.c
+index 4d39de2..37a8833 100644
+--- a/ras-nvgpu.c
++++ b/ras-nvgpu.c
+@@ -15,6 +15,7 @@
+ #include "ras-events.h"
+ #include "ras-logger.h"
+ #include "ras-nvgpu.h"
++#include "trigger.h"
+ void *ras_nvgpu_handle(void *arg)
+ {
+ 	(void)arg;
+@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg)
+ 		return NULL;
+ 	}
+ 
++	setup_event_trigger("nvgpu_event");
++
+ 	while (retry--) {
+ 		if (ras_nvgpu_nvml_handle()) {
+ 			log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry);
+diff --git a/trigger.c b/trigger.c
+index d410137..e113077 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF
+ 
+ struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"};
+ 
++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"};
++
+ static struct event_trigger *event_triggers[] = {
+ 	&mc_ue_trigger,
+ #ifdef HAVE_MCE
+@@ -122,6 +124,9 @@ static struct event_trigger *event_triggers[] = {
+ #ifdef HAVE_KMSG_MONITOR
+ 	&kmsg_trigger,
+ #endif
++#ifdef HAVE_NVGPU
++	&nvgpu_trigger,
++#endif
+ };
+ 
+ void setup_event_trigger(const char *event)
+@@ -476,3 +481,33 @@ free:
+ 		free(env[i]);
+ }
+ 
++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2)
++{
++	char *env[MAX_ENV];
++	int ei = 0;
++	struct event_trigger *trigger = &nvgpu_trigger;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DATA1=%d", data1) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DATA2=%d", data2) < 0)
++		goto free;
++
++	env[ei] = NULL;
++	assert(ei < MAX_ENV);
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (int i = 0; i < ei; i++)
++		free(env[i]);
++}
++
+diff --git a/trigger.h b/trigger.h
+index b5a6c2c..2ea2b09 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e);
+ void run_aer_event_trigger(struct ras_aer_event *e);
+ void run_page_offline_trigger(unsigned long long addr, int otype, int type);
+ void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2);
+ 
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1029-anolis-add-nvgpu-reset-trigger.patch b/1029-anolis-add-nvgpu-reset-trigger.patch
new file mode 100644
index 0000000000000000000000000000000000000000..3cb9e5b02f914e82a05a317a35c5ee4a82b3f171
--- /dev/null
+++ b/1029-anolis-add-nvgpu-reset-trigger.patch
@@ -0,0 +1,76 @@
+From 866c8169c9376f7c0b8a23966caaf099ebbeee9e Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 25 Apr 2025 14:11:30 +0800
+Subject: [PATCH 3/3] anolis: add nvgpu reset trigger
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ contrib/nvgpu_reset_trigger | 40 +++++++++++++++++++++++++++++++++++++
+ contrib/rasdaemon.init      |  4 ++++
+ 2 files changed, 44 insertions(+)
+ create mode 100755 contrib/nvgpu_reset_trigger
+
+diff --git a/contrib/nvgpu_reset_trigger b/contrib/nvgpu_reset_trigger
+new file mode 100755
+index 0000000..769e5e2
+--- /dev/null
++++ b/contrib/nvgpu_reset_trigger
+@@ -0,0 +1,40 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when an
++#  nvgpu event is reported; it resets the affected GPU for the event
++#  type/data combinations tested at the bottom of this file.
++
++# environment:
++# BDF         GPU identifier handed to "nvidia-smi -i"
++# EVENT_TYPE  event type, matched below
++# DATA1       event data, matched below for some event types
++# DATA2       not used by this script
++#
++
++[ -x ./nvgpu_reset_trigger.local ] && . ./nvgpu_reset_trigger.local
++
++if [ -d nvgpu_reset_trigger.extern ]
++then
++    ls nvgpu_reset_trigger.extern |
++    while read item
++    do
++        [ -x ./nvgpu_reset_trigger.extern/$item ] && . ./nvgpu_reset_trigger.extern/$item
++    done
++fi
++# "=" is the POSIX string comparison; "==" fails under shells such as dash.
++if [ "$EVENT_TYPE" = "8" ] && [ "$DATA1" = "48" ]
++then
++    sudo nvidia-smi -r -i "$BDF"
++fi
++
++if [ "$EVENT_TYPE" = "2" ]
++then
++    sudo nvidia-smi -r -i "$BDF"
++fi
++
++if [ "$EVENT_TYPE" = "37" ] && [ "$DATA1" = "48" ]
++then
++    sudo nvidia-smi -r -i "$BDF"
++fi
++
++exit 0
+diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init
+index d575af9..5fde6c8 100644
+--- a/contrib/rasdaemon.init
++++ b/contrib/rasdaemon.init
+@@ -13,6 +13,10 @@ case "$target" in
+ 		sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH}
+ 		sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH}
+ 		;;
++	nvgpu_reset)
++		sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH}
++		sed -i 's/^NVGPU_TRIGGER=.*/NVGPU_TRIGGER="nvgpu_reset_trigger"/g' ${ENV_PATH}
++		;;
+ 	jituan)
+ 		sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH}
+ 		sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH}
+-- 
+2.43.5
+
diff --git a/1029-anolis-add-trigger-for-nvgpu-event.patch b/1029-anolis-add-trigger-for-nvgpu-event.patch
new file mode 100644
index 0000000000000000000000000000000000000000..7e1c34b13c48eddea6ac10b0e11263c1c6f36ed7
--- /dev/null
+++ b/1029-anolis-add-trigger-for-nvgpu-event.patch
@@ -0,0 +1,201 @@
+From 03cd59d6aafbd14ed29ce2f9a73d0bbd8f8b23d3 Mon Sep 17 00:00:00 2001
+From: Ruidong Tian <tianruidong@linux.alibaba.com>
+Date: Fri, 25 Apr 2025 10:20:16 +0800
+Subject: [PATCH 29/30] anolis: add trigger for nvgpu event
+
+Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
+---
+ contrib/nvgpu_trigger    |  25 +++++++++++++++++++++++++
+ misc/rasdaemon.env       |   3 +++
+ ras-nvgpu-nvml.c         |   8 +++++++-
+ ras-nvgpu.c              |   3 +++
+ trigger.c                |  35 +++++++++++++++++++++++++++++++++++
+ trigger.h                |   1 +
+ 9 files changed, 80 insertions(+), 2 deletions(-)
+ create mode 100755 contrib/nvgpu_trigger
+
+diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger
+new file mode 100755
+index 0000000..48955af
+--- /dev/null
++++ b/contrib/nvgpu_trigger
+@@ -0,0 +1,25 @@
++#!/bin/sh
++# SPDX-License-Identifier: GPL-2.0
++#  This shell script can be executed by rasdaemon in daemon mode when an
++#  nvgpu event is reported; the environment variables listed below
++#  describe the event.
++
++# environment:
++# BDF
++# EVENT_TYPE
++# DATA1
++# DATA2
++#
++
++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local
++
++if [ -d nvgpu_trigger.extern ]
++then
++    ls nvgpu_trigger.extern |
++    while read item
++    do
++        [ -x ./nvgpu_trigger.extern/$item ] && . ./nvgpu_trigger.extern/$item
++    done
++fi
++
++exit 0
+diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
+index 198b050..b08afa6 100644
+--- a/misc/rasdaemon.env
++++ b/misc/rasdaemon.env
+@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
+ KMSG_TRIGGER=
+ KMSG_TRIGGER_TIMEOUT=0
+ 
++NVGPU_TRIGGER=
++NVGPU_TRIGGER_TIMEOUT=0
++
+ # CE Statistic Threshold
+ #
+ # Specify the threshold of CE per second.
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c
+index 541ff69..f2421a1 100644
+--- a/ras-nvgpu-nvml.c
++++ b/ras-nvgpu-nvml.c
+@@ -4,6 +4,7 @@
+  * Copyright (C) 2025 Alibaba Inc
+  */
+ 
++#include <stdio.h>
+ #include <time.h>
+ #include <unistd.h>
+ #include <stdlib.h>
+@@ -13,6 +14,7 @@
+ #include "ras-nvgpu.h"
+ #include "trace-seq.h"
+ #include "types.h"
++#include "trigger.h"
+ 
+ const char *lib_name[] = {
+ 	"/lib64/libnvidia-ml.so",
+@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 	time_t now;
+ 	struct tm *tm;
+ 	char timestamp[64];
++	char tmpbuf[64];
+ 
+ 	time(&now);
+ 	tm = localtime(&now);
+@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 		trace_seq_printf(&s, "data: %lld ", data->eventData);
+ 	}
+ 
+-	trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci));
++	snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci));
++	trace_seq_printf(&s, "pci_port: %s ", tmpbuf);
+ 	trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId);
+ 	trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId);
+ 
+@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices)
+ 	fflush(stdout);
+ 	trace_seq_destroy(&s);
+ 
++	run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0);
++
+ 	return 0;
+ }
+ 
+diff --git a/ras-nvgpu.c b/ras-nvgpu.c
+index 4d39de2..37a8833 100644
+--- a/ras-nvgpu.c
++++ b/ras-nvgpu.c
+@@ -15,6 +15,7 @@
+ #include "ras-events.h"
+ #include "ras-logger.h"
+ #include "ras-nvgpu.h"
++#include "trigger.h"
+ void *ras_nvgpu_handle(void *arg)
+ {
+ 	(void)arg;
+@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg)
+ 		return NULL;
+ 	}
+ 
++	setup_event_trigger("nvgpu_event");
++
+ 	while (retry--) {
+ 		if (ras_nvgpu_nvml_handle()) {
+ 			log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry);
+diff --git a/trigger.c b/trigger.c
+index d410137..e113077 100644
+--- a/trigger.c
++++ b/trigger.c
+@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF
+ 
+ struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"};
+ 
++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"};
++
+ static struct event_trigger *event_triggers[] = {
+ 	&mc_ue_trigger,
+ #ifdef HAVE_MCE
+@@ -122,6 +124,9 @@ static struct event_trigger *event_triggers[] = {
+ #ifdef HAVE_KMSG_MONITOR
+ 	&kmsg_trigger,
+ #endif
++#ifdef HAVE_NVGPU
++	&nvgpu_trigger,
++#endif
+ };
+ 
+ void setup_event_trigger(const char *event)
+@@ -476,3 +481,33 @@ free:
+ 		free(env[i]);
+ }
+ 
++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2)
++{
++	char *env[MAX_ENV];
++	int ei = 0;
++	struct event_trigger *trigger = &nvgpu_trigger;
++
++	if (!trigger->path || !strcmp(trigger->path, ""))
++		return;
++	/* Describe the event to the trigger script through its environment. */
++	if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
++		goto free;
++	if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DATA1=%d", data1) < 0)
++		goto free;
++	if (asprintf(&env[ei++], "DATA2=%d", data2) < 0)
++		goto free;
++	/* Check that the NULL terminator fits before writing it. */
++	assert(ei < MAX_ENV);
++	env[ei] = NULL;
++
++	run_trigger(trigger, NULL, env);
++
++free:
++	for (int i = 0; i < ei; i++)
++		free(env[i]);
++}
++
+diff --git a/trigger.h b/trigger.h
+index b5a6c2c..2ea2b09 100644
+--- a/trigger.h
++++ b/trigger.h
+@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e);
+ void run_aer_event_trigger(struct ras_aer_event *e);
+ void run_page_offline_trigger(unsigned long long addr, int otype, int type);
+ void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2);
+ 
+ 
+ #endif
+-- 
+2.43.5
+
diff --git a/1030-fix-build-error-of-some-variable-undefine.patch b/1030-fix-build-error-of-some-variable-undefine.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c13f203c1524f81e34a219a89a81d70373e2c4c6
--- /dev/null
+++ b/1030-fix-build-error-of-some-variable-undefine.patch
@@ -0,0 +1,27 @@
+From e2c1a3ce09f74e6de2ea8bb710b51babf7645376 Mon Sep 17 00:00:00 2001
+From: happy_orange <songnannan.snn@alibaba-inc.com>
+Date: Fri, 6 Jun 2025 14:42:13 +0800
+Subject: [PATCH 1/1] fix build error of some variable undefine
+
+---
+ ras-pcie-edpc.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/ras-pcie-edpc.c b/ras-pcie-edpc.c
+index 4731b05..53d93ed 100644
+--- a/ras-pcie-edpc.c
++++ b/ras-pcie-edpc.c
+@@ -41,8 +41,8 @@ static bool is_cxl_mem_or_cache(struct pci_dev *dev)
+ 	if (vendor != PCI_DVSEC_VENDOR_ID_CXL || id != PCI_DVSEC_ID_CXL)
+ 		return false;
+ 
+-	cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_CAP);
+-	if (cxl_cap & (PCI_CXL_CAP_CACHE | PCI_CXL_CAP_MEM))
++	cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_DEV_CAP);
++	if (cxl_cap & (PCI_CXL_DEV_CAP_CACHE | PCI_CXL_DEV_CAP_MEM))
+ 		return true;
+ 
+ 	return false;
+-- 
+2.43.5
+
diff --git a/dist b/dist
new file mode 100644
index 0000000000000000000000000000000000000000..ffd87663ad49340de9f1a1f342206406d2ba1712
--- /dev/null
+++ b/dist
@@ -0,0 +1 @@
+an23
diff --git a/rasdaemon-0.8.0.tar.bz2 b/rasdaemon-0.8.0.tar.bz2
deleted file mode 100644
index 8837c6ab83ad422135d477bf4e3d36d88a8eb1ce..0000000000000000000000000000000000000000
Binary files a/rasdaemon-0.8.0.tar.bz2 and /dev/null differ
diff --git a/rasdaemon-0.8.3.tar.bz2 b/rasdaemon-0.8.3.tar.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..bc6bb41df46c10f3f3b80d222fd311a445e1884c
Binary files /dev/null and b/rasdaemon-0.8.3.tar.bz2 differ
diff --git a/rasdaemon.spec b/rasdaemon.spec
index 0a0057f8f1529cb4507fd777673eec54de1495e7..a574639cfbee32d6f579329e2d9119fb98d049f2 100644
--- a/rasdaemon.spec
+++ b/rasdaemon.spec
@@ -1,22 +1,71 @@
-%define anolis_release 1
-
+%define anolis_release  2
 Name:			rasdaemon
-Version:		0.8.0
+Version:		0.8.3
 Release:		%{anolis_release}%{?dist}
 Summary:		Utility to receive RAS error tracings
-Group:			Applications/System
-License:		GPLv2
+License:		GPL-2.0-only
 URL:			http://git.infradead.org/users/mchehab/rasdaemon.git
 Source0:		http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
+Patch1001: 1001-config-add-syslog-ng-and-logrotate-config.patch
+Patch1002: 1002-config-add-rsyslog-config.patch
+Patch1003: 1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch
+Patch1004: 1004-rasdaemon-align-event-name-in-log.patch
+Patch1005: 1005-rasdaemon-skip-doesn-t-exist-event.patch
+Patch1006: 1006-rasdaemon-support-memory-corrected-error-statistics.patch
+Patch1007: 1007-rasdaemon-introduce-poison-page-statistics.patch
+Patch1008: 1008-rasdaemon-erst-decode-panic-mce-through-erst.patch
+Patch1009: 1009-aer-print-pci-device-name-and-vendor-device-id.patch
+Patch1010: 1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch
+Patch1011: 1011-rasdaemon-support-nvgpu-event.patch
+Patch1012: 1012-rasdaemon-enhance-rasdaemon-event-trigger.patch
+Patch1013: 1013-rasdaemon-add-event-level-for-event-record.patch
+Patch1014: 1014-anolis-syslog-add-rasdaemon.ext.patch
+Patch1015: 1015-rasdaemon-add-page-offline-trigger.patch
+Patch1016: 1016-anolis-compta-rasdaemon-notices.patch
+Patch1017: 1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch
+Patch1018: 1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch
+Patch1019: 1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch
+Patch1020: 1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch
+Patch1021: 1021-anolis-config-disable-page-offline-defalut.patch
+Patch1022: 1022-anolis-disable-block-and-dev-error-default.patch
+Patch1023: 1023-anolis-add-nvml-in-tree.patch
+Patch1024: 1024-anolis-do-not-print-teq-error.patch
+Patch1025: 1025-anolis-add-init.sh-for-different-user.patch
+Patch1026: 1026-anolis-fix-systemd-config.patch
+Patch1027: 1027-anolis-add-nvgpu-driver.patch
+Patch1028: 1028-anolis-add-trigger-for-nvgpu-event.patch
+Patch1029: 1029-anolis-add-nvgpu-reset-trigger.patch
+Patch1030: 1030-fix-build-error-of-some-variable-undefine.patch
 
-BuildRequires:		make gcc autoconf automake libtool perl-generators
-BuildRequires:		gettext-devel sqlite-devel libtraceevent-devel systemd-rpm-macros
+ExcludeArch:		s390 s390x
+BuildRequires:		make
+BuildRequires:		gcc
+BuildRequires:		gettext-devel
+BuildRequires:		perl-generators
+BuildRequires:		sqlite-devel
+BuildRequires:		systemd
+BuildRequires:		autoconf
+BuildRequires:		automake
+BuildRequires:		libtool
+BuildRequires:		libtraceevent-devel
+BuildRequires:		pciutils-devel
+BuildRequires:		zlib-devel
+BuildRequires:		python3
+BuildRequires:		rasdaemon-open-gpu-kernel-modules
 Provides:		bundled(kernel-event-lib)
-Requires:		hwdata perl-DBD-SQLite libtraceevent
-%ifarch x86_64
+Requires:		hwdata
+Requires:		perl-DBD-SQLite
+Requires:		libtraceevent
+Requires:		pciutils
+Requires:		zlib
+%ifarch %{ix86} x86_64
 Requires:		dmidecode
 %endif
 
+Requires(post):		systemd
+Requires(preun):	systemd
+Requires(postun):	systemd
+
 %description
 %{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
 It currently records memory errors, using the EDAC tracing events.
@@ -37,45 +86,147 @@ Doc files for %{name}
 
 %prep
 %setup -q
-autoreconf -vfi
+tar -xf /usr/share/rasdaemon-open-gpu-kernel-modules/*.tar.gz
+mv open-gpu-kernel-modules-* open-gpu-kernel-modules
+%patch1001 -p1
+%patch1002 -p1
+%patch1003 -p1
+%patch1004 -p1
+%patch1005 -p1
+%patch1006 -p1
+%patch1007 -p1
+%patch1008 -p1
+%patch1009 -p1
+%patch1010 -p1
+%patch1011 -p1
+%patch1012 -p1
+%patch1013 -p1
+%patch1014 -p1
+%patch1015 -p1
+%patch1016 -p1
+%patch1017 -p1
+%patch1018 -p1
+%patch1019 -p1
+%patch1020 -p1
+%patch1021 -p1
+%patch1022 -p1
+%patch1023 -p1
+%patch1024 -p1
+%patch1025 -p1
+%patch1026 -p1
+%patch1027 -p1
+%patch1028 -p1
+%patch1029 -p1
+%patch1030 -p1
+
 
 %build
-%ifarch aarch64
-%configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm \
-	   --enable-mce --enable-extlog --enable-devlink --enable-diskerror \
-	   --enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \
-	   --enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \
-	   --with-sysconfdefdir=%{_sysconfdir}/sysconfig
+%ifarch loongarch64
+%configure --enable-sqlite3 --enable-aer --enable-non-standard \
+           --enable-devlink --enable-diskerror \
+           --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \
+           --enable-cxl --enable-json-report --enable-memory-ce-pfa --enable-memory-row-ce-pfa \
+           --enable-signal --enable-erst --enable-kmsg-monitor \
+           --with-sysconfdefdir=%{_sysconfdir}/sysconfig
 %else
-%configure --enable-sqlite3 --enable-aer \
-	   --enable-mce --enable-extlog --enable-devlink --enable-diskerror \
-	   --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \
-	   --with-sysconfdefdir=%{_sysconfdir}/sysconfig
+%configure --enable-all --with-sysconfdefdir=%{_sysconfdir}/sysconfig
 %endif
-%make_build
+make %{?_smp_mflags}
 
 %install
 make install DESTDIR=%{buildroot}
-install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
+install -D -p -m 0644 misc/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
 install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
-install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
-rm %{buildroot}/usr/include/*.h
-%generate_compatibility_deps
+install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
+install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng
+install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate
+install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog
+install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext
+install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext
+install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/
+install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/
+install -D -p -m 0755 contrib/%{name}.init %{buildroot}/usr/share/%{name}/%{name}.init
+rm INSTALL %{buildroot}/usr/include/*.h
 
 %files
-%license COPYING
-%{_sbindir}/rasdaemon
+%doc AUTHORS ChangeLog COPYING TODO
+%{_sbindir}/%{name}
 %{_sbindir}/ras-mc-ctl
 %{_mandir}/*/*
 %{_unitdir}/*.service
 %{_sysconfdir}/ras/dimm_labels.d
-%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
-%dir %{abidir}
-%{abidir}/rasdaemon-option.list
+%{_sysconfdir}/ras/*/*
+%{_sysconfdir}/sysconfig/%{name}
+/usr/share/%{name}/%{name}.syslog-ng
+/usr/share/%{name}/%{name}.logrotate
+/usr/share/%{name}/%{name}.rsyslog
+/usr/share/%{name}/%{name}.syslog-ng-ext
+/usr/share/%{name}/%{name}.rsyslog-ext
+/usr/share/%{name}/%{name}.init
+%{_sysconfdir}/rasdaemon_notices/*
+
+%post
+if systemctl is-active --quiet syslog-ng.service; then
+    echo "Syslog service is enabled and running, create config file and restart it";
+    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+    ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf;
+    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
+    ln -s /usr/share/%{name}/%{name}.syslog-ng-ext %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
+    systemctl restart syslog-ng.service;
+fi
+if systemctl is-active --quiet rsyslog.service; then
+    echo "Rsyslog service is enabled and running, create config file and restart it";
+    rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf;
+    ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf;
+    rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
+    ln -s /usr/share/%{name}/%{name}.rsyslog-ext %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
+    systemctl restart rsyslog.service;
+fi
+if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+    rm -rf %{_sysconfdir}/logrotate.d/%{name};
+    ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name};
+fi
+if ! systemctl is-enabled --quiet %{name}.service; then
+    echo "Rasdaemon service is not enabled, enable it";
+    systemctl enable %{name}.service;
+fi
+echo "Rasdaemon install for ${RASDAEMON_TARGET}";
+/usr/share/%{name}/%{name}.init ${RASDAEMON_TARGET}
+
+systemctl daemon-reload
+systemctl restart %{name}.service
+
+%preun
+# $1 is 0 only on final removal; an upgrade must leave the service enabled and running.
+if [ "$1" -eq 0 ]; then systemctl stop %{name}.service; systemctl disable %{name}.service; fi
+
+%postun
+# On upgrade this runs after the new package's post script: keep the links it just created.
+[ "$1" -eq 0 ] || exit 0
+if systemctl is-active --quiet syslog-ng.service; then
+    echo "Syslog-ng service is enabled and running, delete config file and restart it";
+    rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf;
+    systemctl restart syslog-ng.service;
+fi
+if systemctl is-active --quiet rsyslog.service; then
+    echo "Rsyslog service is enabled and running, delete config file and restart it";
+    rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf;
+    systemctl restart rsyslog.service;
+fi
+if [ -d "%{_sysconfdir}/logrotate.d" ]; then
+    rm -rf %{_sysconfdir}/logrotate.d/%{name};
+fi
 
 %files doc
-%doc AUTHORS ChangeLog README.md TODO INSTALL
+%doc AUTHORS ChangeLog README.md TODO
 
 %changelog
+* Thu Mar 20 2025 wangzhe <wanglan.wz@alibaba-inc.com> - 0.8.3-2
+- update to 0.8.3
+- support mc event stat
+- support poison stat
+- support log level
+- support nvgpu event
+
 * Fri Apr 07 2023 Chunmei Xu <xuchunmei@linux.alibaba.com> - 0.8.0-1
 - init from upstream