From 8a45af2cb7a1f5fe73b58b330699016191fd0a9e Mon Sep 17 00:00:00 2001 From: peixiangwang <16574146+peixiangwang@user.noreply.gitee.com> Date: Fri, 16 Jan 2026 10:20:40 +0800 Subject: [PATCH 01/59] add new driver for dn200 NIC --- arch/arm64/configs/tencent.config | 42 +- arch/x86/configs/tencent.config | 10 +- drivers/net/ethernet/Kconfig | 1 + drivers/net/ethernet/Makefile | 1 + drivers/net/ethernet/dapustor/Kconfig | 30 + drivers/net/ethernet/dapustor/Makefile | 6 + drivers/net/ethernet/dapustor/dn200/Makefile | 15 + drivers/net/ethernet/dapustor/dn200/common.h | 695 + drivers/net/ethernet/dapustor/dn200/descs.h | 163 + drivers/net/ethernet/dapustor/dn200/dn200.h | 685 + .../net/ethernet/dapustor/dn200/dn200_cfg.h | 289 + .../net/ethernet/dapustor/dn200/dn200_ctrl.c | 2170 ++++ .../net/ethernet/dapustor/dn200/dn200_ctrl.h | 431 + .../net/ethernet/dapustor/dn200/dn200_dcb.c | 913 ++ .../net/ethernet/dapustor/dn200/dn200_dcb.h | 16 + .../net/ethernet/dapustor/dn200/dn200_eprom.c | 221 + .../net/ethernet/dapustor/dn200/dn200_eprom.h | 80 + .../ethernet/dapustor/dn200/dn200_ethtool.c | 2170 ++++ .../ethernet/dapustor/dn200/dn200_hwtstamp.c | 174 + .../net/ethernet/dapustor/dn200/dn200_iatu.c | 855 ++ .../net/ethernet/dapustor/dn200/dn200_iatu.h | 106 + .../net/ethernet/dapustor/dn200/dn200_main.c | 10515 ++++++++++++++++ .../net/ethernet/dapustor/dn200/dn200_mdio.c | 383 + .../net/ethernet/dapustor/dn200/dn200_pci.c | 1602 +++ .../net/ethernet/dapustor/dn200/dn200_phy.h | 716 ++ .../ethernet/dapustor/dn200/dn200_phy_impl.c | 4227 +++++++ .../net/ethernet/dapustor/dn200/dn200_pool.c | 669 + .../net/ethernet/dapustor/dn200/dn200_pool.h | 822 ++ .../net/ethernet/dapustor/dn200/dn200_prod.h | 551 + .../net/ethernet/dapustor/dn200/dn200_ptp.c | 342 + .../net/ethernet/dapustor/dn200/dn200_ptp.h | 84 + .../net/ethernet/dapustor/dn200/dn200_reg.c | 96 + .../net/ethernet/dapustor/dn200/dn200_reg.h | 24 + .../net/ethernet/dapustor/dn200/dn200_self.h | 234 + 
.../ethernet/dapustor/dn200/dn200_selftests.c | 1108 ++ .../ethernet/dapustor/dn200/dn200_spec_acc.c | 83 + .../ethernet/dapustor/dn200/dn200_spec_acc.h | 20 + .../ethernet/dapustor/dn200/dn200_spec_def.h | 74 + .../net/ethernet/dapustor/dn200/dn200_sriov.c | 907 ++ .../net/ethernet/dapustor/dn200/dn200_sriov.h | 571 + .../ethernet/dapustor/dn200/dwxgmac2_core.c | 4457 +++++++ .../ethernet/dapustor/dn200/dwxgmac2_descs.c | 461 + .../ethernet/dapustor/dn200/dwxgmac2_dma.c | 771 ++ .../ethernet/dapustor/dn200/dwxgmac_comm.h | 630 + .../net/ethernet/dapustor/dn200/extern_phy.c | 392 + drivers/net/ethernet/dapustor/dn200/hwif.c | 210 + drivers/net/ethernet/dapustor/dn200/hwif.h | 778 ++ drivers/net/ethernet/dapustor/dn200/mmc.h | 146 + .../net/ethernet/dapustor/dn200/mmc_core.c | 378 + .../net/ethernet/dapustor/dn200/ring_mode.c | 134 + 50 files changed, 40417 insertions(+), 41 deletions(-) create mode 100644 drivers/net/ethernet/dapustor/Kconfig create mode 100644 drivers/net/ethernet/dapustor/Makefile create mode 100644 drivers/net/ethernet/dapustor/dn200/Makefile create mode 100644 drivers/net/ethernet/dapustor/dn200/common.h create mode 100644 drivers/net/ethernet/dapustor/dn200/descs.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_cfg.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_ctrl.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_ctrl.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_dcb.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_dcb.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_eprom.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_eprom.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_ethtool.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_hwtstamp.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_iatu.c create mode 100644 
drivers/net/ethernet/dapustor/dn200/dn200_iatu.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_main.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_mdio.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_pci.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_phy.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_phy_impl.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_pool.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_pool.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_prod.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_ptp.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_ptp.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_reg.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_reg.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_self.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_selftests.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_spec_def.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_sriov.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dn200_sriov.h create mode 100644 drivers/net/ethernet/dapustor/dn200/dwxgmac2_core.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dwxgmac2_descs.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dwxgmac2_dma.c create mode 100644 drivers/net/ethernet/dapustor/dn200/dwxgmac_comm.h create mode 100644 drivers/net/ethernet/dapustor/dn200/extern_phy.c create mode 100644 drivers/net/ethernet/dapustor/dn200/hwif.c create mode 100644 drivers/net/ethernet/dapustor/dn200/hwif.h create mode 100644 drivers/net/ethernet/dapustor/dn200/mmc.h create mode 100644 drivers/net/ethernet/dapustor/dn200/mmc_core.c create mode 100644 
drivers/net/ethernet/dapustor/dn200/ring_mode.c diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 92b0cf9cfe42..3c8c2e0f024f 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -2,8 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_WATCH_QUEUE=y -CONFIG_NO_HZ_FULL=y -CONFIG_CONTEXT_TRACKING_USER_FORCE=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BPF_SYSCALL=y @@ -70,13 +68,13 @@ CONFIG_ARCH_THUNDER=y CONFIG_ARCH_THUNDER2=y CONFIG_ARCH_VEXPRESS=y CONFIG_ARCH_XGENE=y +CONFIG_HISILICON_ERRATUM_162102203=y CONFIG_ARM64_VA_BITS_48=y CONFIG_SCHED_MC=y CONFIG_SCHED_CLUSTER=y CONFIG_SCHED_SMT=y CONFIG_NR_CPUS=1024 CONFIG_NUMA=y -CONFIG_NUMA_EMU=y CONFIG_NODES_SHIFT=8 CONFIG_NUMA_AWARE_SPINLOCKS=y CONFIG_PARAVIRT_TIME_ACCOUNTING=y @@ -162,7 +160,6 @@ CONFIG_DEVICE_PRIVATE=y CONFIG_USERFAULTFD=y CONFIG_LRU_GEN=y CONFIG_LRU_GEN_ENABLED=y -CONFIG_ARM64_HAFT=y CONFIG_DAMON=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_PADDR=y @@ -602,7 +599,6 @@ CONFIG_PCI_PRI=y CONFIG_PCI_PASID=y CONFIG_PCI_HYPERV=m CONFIG_VGA_ARB_MAX_GPUS=64 -CONFIG_HOTPLUG_SMT=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_ACPI=y CONFIG_HOTPLUG_PCI_ACPI_IBM=m @@ -615,7 +611,6 @@ CONFIG_CXL_BUS=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_HISILICON_LPC=y -CONFIG_HISILICON_ERRATUM_165010801=y CONFIG_FSL_MC_BUS=y CONFIG_CONNECTOR=y CONFIG_ARM_SCMI_PROTOCOL=m @@ -814,6 +809,7 @@ CONFIG_CHELSIO_T4VF=m CONFIG_CHELSIO_IPSEC_INLINE=m CONFIG_CHELSIO_TLS_DEVICE=m CONFIG_ENIC=m +CONFIG_DN200=m CONFIG_DNET=m # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set @@ -851,10 +847,10 @@ CONFIG_MLX5_EN_TLS=y CONFIG_MLXSW_CORE=m CONFIG_MLXBF_GIGE=m # CONFIG_NET_VENDOR_MICREL is not set -CONFIG_MYRI10GE=m -# CONFIG_NET_VENDOR_NATSEMI is not set CONFIG_MXGBE=m CONFIG_MGBE=m +CONFIG_MYRI10GE=m +# CONFIG_NET_VENDOR_NATSEMI is not set CONFIG_NFP=m # CONFIG_NET_VENDOR_NVIDIA is not set CONFIG_ETHOC=m @@ -1423,8 +1419,6 @@ 
CONFIG_MLX5_VFIO_PCI=m CONFIG_VFIO_PLATFORM=m CONFIG_VFIO_FSL_MC=m CONFIG_VIRT_DRIVERS=y -CONFIG_TSM_REPORTS=m -CONFIG_ARM_CCA_GUEST=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_VDPA=m CONFIG_VIRTIO_BALLOON=m @@ -1464,6 +1458,8 @@ CONFIG_ARM_SMMU_V3_SVA=y CONFIG_VIRTIO_IOMMU=m CONFIG_DPAA2_CONSOLE=m CONFIG_KUNPENG_HCCS=m +CONFIG_HISI_SOC_CACHE=m +CONFIG_HISI_SOC_HHA=m CONFIG_EXTCON=m CONFIG_EXTCON_GPIO=m CONFIG_NTB=m @@ -1491,6 +1487,8 @@ CONFIG_DEV_DAX=m CONFIG_NVMEM_LAYERSCAPE_SFP=m CONFIG_TEE=m CONFIG_MUX_GPIO=m +CONFIG_CPU_INSPECT=m +CONFIG_CPU_INSPECTOR_ATF=m CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -1616,7 +1614,6 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_KOI8_R=m CONFIG_NLS_UTF8=m CONFIG_TRUSTED_KEYS=y -CONFIG_SECURITY=y CONFIG_SECURITY_INFINIBAND=y CONFIG_SECURITY_NETWORK_XFRM=y CONFIG_SECURITY_SELINUX=y @@ -1759,26 +1756,3 @@ CONFIG_TEST_FIRMWARE=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_MEMTEST=y -CONFIG_HISI_SOC_CACHE=m -CONFIG_HISI_SOC_HHA=m -CONFIG_SDEI_WATCHDOG=y -CONFIG_ARM64_NMI=y -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_ARM64_BRBE=y -# -# CPU Inspect -# -CONFIG_CPU_INSPECT=m -CONFIG_CPU_INSPECTOR_ATF=m -# end of CPU Inspect -CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y -CONFIG_HISILICON_ERRATUM_162102203=y - -# -# ARMv8.6 architectural features -# -CONFIG_ARM64_TWED=y -# end of ARMv8.6 architectural features - -CONFIG_HISILICON_ERRATUM_162100801=y -CONFIG_HISILICON_ERRATUM_162100803=y \ No newline at end of file diff --git a/arch/x86/configs/tencent.config b/arch/x86/configs/tencent.config index 87730a59d1d1..d99539d3db0a 100644 --- a/arch/x86/configs/tencent.config +++ b/arch/x86/configs/tencent.config @@ -72,7 +72,6 @@ CONFIG_PARAVIRT_TIME_ACCOUNTING=y CONFIG_JAILHOUSE_GUEST=y CONFIG_INTEL_TDX_GUEST=y CONFIG_USING_FPU_IN_KERNEL_NONATOMIC=y -CONFIG_X86_HYGON_LMC_SSE2_ON=y CONFIG_GART_IOMMU=y CONFIG_MAXSMP=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y @@ -886,6 +885,7 @@ CONFIG_LIQUIDIO_VF=m CONFIG_CHELSIO_T4VF=m 
CONFIG_CHELSIO_IPSEC_INLINE=m CONFIG_CHELSIO_TLS_DEVICE=m +CONFIG_DN200=m CONFIG_DNET=m CONFIG_NET_TULIP=y CONFIG_DE2104X=m @@ -1775,6 +1775,7 @@ CONFIG_EXT2_FS_SECURITY=y CONFIG_EXT3_FS=m CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_REISERFS_PROC_INFO=y CONFIG_REISERFS_FS_XATTR=y @@ -1802,8 +1803,8 @@ CONFIG_QFMT_V2=y CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m CONFIG_CUSE=m -CONFIG_FUSE_CONN_ALIVE=y CONFIG_VIRTIO_FS=m +CONFIG_FUSE_CONN_ALIVE=y CONFIG_OVERLAY_FS=m CONFIG_OVERLAY_FS_INDEX=y CONFIG_OVERLAY_FS_METACOPY=y @@ -1879,11 +1880,8 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_TRUSTED_KEYS=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_INFINIBAND=y CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_YAMA=y @@ -1971,7 +1969,6 @@ CONFIG_CRYPTO_DEV_CHELSIO=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_CRYPTO_DEV_TSSE=m CONFIG_CORDIC=m -CONFIG_CRC16=y CONFIG_CRC7=m CONFIG_PRINTK_TIME=y CONFIG_PRINTK_CALLER=y @@ -2020,4 +2017,3 @@ CONFIG_ATOMIC64_SELFTEST=y CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_KSTRTOX=y CONFIG_TEST_BPF=m -CONFIG_EXT4_FS=y diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index d8caa9d4d717..2f704cec6b86 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -57,6 +57,7 @@ source "drivers/net/ethernet/chelsio/Kconfig" source "drivers/net/ethernet/cirrus/Kconfig" source "drivers/net/ethernet/cisco/Kconfig" source "drivers/net/ethernet/cortina/Kconfig" +source "drivers/net/ethernet/dapustor/Kconfig" source "drivers/net/ethernet/davicom/Kconfig" config DNET diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 018c1ebbfd7b..582c31871771 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_NET_VENDOR_CORTINA) += cortina/ obj-$(CONFIG_CX_ECAT) += ec_bhf.o 
obj-$(CONFIG_DM9000) += davicom/ obj-$(CONFIG_DNET) += dnet.o +obj-$(CONFIG_NET_VENDOR_DAPUSTOR) += dapustor/ obj-$(CONFIG_NET_VENDOR_DEC) += dec/ obj-$(CONFIG_NET_VENDOR_DLINK) += dlink/ obj-$(CONFIG_NET_VENDOR_EMULEX) += emulex/ diff --git a/drivers/net/ethernet/dapustor/Kconfig b/drivers/net/ethernet/dapustor/Kconfig new file mode 100644 index 000000000000..c6c4ed073859 --- /dev/null +++ b/drivers/net/ethernet/dapustor/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Dapustor network device configuration +# + +config NET_VENDOR_DAPUSTOR + bool "Dapustor devices" + default y + help + If you have a network (Ethernet) card from Dapustor(R), say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Dapustor(R) cards. If you say Y, you will + be asked for your specific card in the following questions. + +if NET_VENDOR_DAPUSTOR + +config DN200 + tristate "Dapustor(R) DN200 PCI Express adapters support" + depends on PCI + select MII + help + This driver supports Dapustor(R) DN200 PCI Express family of + adapters. + + To compile this driver as a module, choose M here. The module + will be called dn200. + +endif #NET_VENDOR_DAPUSTOR diff --git a/drivers/net/ethernet/dapustor/Makefile b/drivers/net/ethernet/dapustor/Makefile new file mode 100644 index 000000000000..004168754fae --- /dev/null +++ b/drivers/net/ethernet/dapustor/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Dapustor network device drivers. +# + +obj-$(CONFIG_DN200) += dn200/ diff --git a/drivers/net/ethernet/dapustor/dn200/Makefile b/drivers/net/ethernet/dapustor/dn200/Makefile new file mode 100644 index 000000000000..c15925c08de3 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +# Copyright (c) 2024, DapuStor Corporation. 
+ +obj-m := dn200.o +dn200-objs := \ + dn200_main.o dn200_ethtool.o dn200_mdio.o ring_mode.o \ + mmc_core.o dn200_hwtstamp.o dn200_ptp.o \ + hwif.o \ + dwxgmac2_core.o dwxgmac2_dma.o dwxgmac2_descs.o \ + dn200_selftests.o dn200_pci.o\ + dn200_sriov.o dn200_ctrl.o \ + dn200_phy_impl.o dn200_spec_acc.o \ + dn200_dcb.o dn200_eprom.o dn200_iatu.o dn200_pool.o dn200_reg.o \ + extern_phy.o diff --git a/drivers/net/ethernet/dapustor/dn200/common.h b/drivers/net/ethernet/dapustor/dn200/common.h new file mode 100644 index 000000000000..703c01423e8b --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/common.h @@ -0,0 +1,695 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include +#include +#include +#include +#define DN200_VLAN_TAG_USED +#include +#include "dn200_cfg.h" +#include "descs.h" +#include "hwif.h" +#include "mmc.h" + +#define DN200_DCB_FEATURE_DISABLE 0 +/* Total xgmac number */ +#define XGE_NUM 4 + +/* Chip Core versions */ +#define DWMAC_CORE_3_40 0x34 +#define DWMAC_CORE_3_50 0x35 +#define DWMAC_CORE_4_00 0x40 +#define DWMAC_CORE_4_10 0x41 +#define DWMAC_CORE_5_00 0x50 +#define DWMAC_CORE_5_10 0x51 +#define DWMAC_CORE_5_20 0x52 +#define DWXGMAC_CORE_2_10 0x21 +#define DWXLGMAC_CORE_2_00 0x20 + +/* Device ID */ +#define DWXGMAC_ID 0x76 +#define DWXLGMAC_ID 0x27 + +/* TX and RX Descriptor Length, these need to be power of two. + * TX descriptor length less than 64 may cause transmit queue timed out error. + * RX descriptor length less than 64 may cause inconsistent Rx chain error.
+ */ +#define DMA_MIN_TX_SIZE 64 +#define DMA_MAX_TX_SIZE 4096 +#define DMA_DEFAULT_TX_SIZE 2048 +#define DMA_MIN_RX_SIZE 64 +#define DMA_MAX_RX_SIZE \ + 8192 // 16 rings, 8MB pkt buf per ring(2KB * 8192 * 2), total cost 256MB per port +#define DMA_DEFAULT_RX_SIZE 2048 +#define DMA_DEFAULT_VF_RX_SIZE 512 +#define DN200_GET_ENTRY(x, size) (((x) + 1) & ((size) - 1)) +#define DN200_GET_PREVENTRY(x, size) (((x) + (size) - 1) & ((size) - 1)) + +#undef FRAME_FILTER_DEBUG +/* #define FRAME_FILTER_DEBUG */ + +/* Error messages */ +#define DN200_PCIE_BAR_ERR "Access PCIe Bar failed! Please check HW or PCIe!" + +#define DN200_FW_ERR_MSG "ADMIN COMMAND failed! Please check FW!" + +enum DRV_TYPE { + DRV_PURE_PF = 0, + DRV_SRIOV_PF, + DRV_VF, + + DRV_TYPE_MAX +}; + +struct dn200_txq_stats { + u64 tx_pkt_n; + u64 tx_normal_irq_n; +}; + +struct dn200_rxq_stats { + u64 rx_pkt_n; + u64 rx_normal_irq_n; +}; + +/* Extra statistic and debug information exposed by ethtool */ +struct dn200_extra_stats { + /* Transmit errors */ + u64 tx_underflow ____cacheline_aligned; + u64 tx_carrier; + u64 tx_losscarrier; + u64 vlan_tag; + u64 tx_deferred; + u64 tx_vlan; + u64 tx_jabber; + u64 tx_frame_flushed; + u64 tx_payload_error; + u64 tx_ip_header_error; + /* Receive errors */ + u64 rx_desc; + u64 sa_filter_fail; + u64 overflow_error; + u64 ipc_csum_error; + u64 rx_collision; + u64 rx_crc_errors; + u64 dribbling_bit; + u64 rx_length; + u64 rx_mii; + u64 rx_multicast; + u64 rx_gmac_overflow; + u64 rx_watchdog; + u64 da_rx_filter_fail; + u64 sa_rx_filter_fail; + u64 rx_missed_cntr; + u64 rx_overflow_cntr; + u64 rx_vlan; + u64 rx_split_hdr_pkt_n; + u64 rx_csum_err; + /* Tx/Rx IRQ error info */ + u64 tx_undeflow_irq; + u64 tx_process_stopped_irq; + u64 tx_jabber_irq; + u64 rx_overflow_irq; + u64 rx_buf_unav_irq; + u64 rx_process_stopped_irq; + u64 rx_watchdog_irq; + u64 tx_early_irq; + u64 fatal_bus_error_irq; + /* Tx/Rx IRQ Events */ + u64 rx_early_irq; + u64 tx_pkt_n; + u64 rx_pkt_n; + u64 
normal_irq_n; + u64 rx_normal_irq_n; + u64 napi_poll; + u64 tx_normal_irq_n; + u64 tx_clean; + u64 tx_set_ic_bit; + u64 irq_receive_pmt_irq_n; + /* MMC info */ + u64 mmc_tx_irq_n; + u64 mmc_rx_irq_n; + u64 mmc_rx_csum_offload_irq_n; + /* EEE */ + u64 irq_tx_path_in_lpi_mode_n; + u64 irq_tx_path_exit_lpi_mode_n; + u64 irq_rx_path_in_lpi_mode_n; + u64 irq_rx_path_exit_lpi_mode_n; + u64 phy_eee_wakeup_error_n; + /* Extended RDES status */ + u64 ip_hdr_err; + u64 ip_payload_err; + u64 ip_csum_bypassed; + u64 ipv4_pkt_rcvd; + u64 ipv6_pkt_rcvd; + u64 no_ptp_rx_msg_type_ext; + u64 ptp_rx_msg_type_sync; + u64 ptp_rx_msg_type_follow_up; + u64 ptp_rx_msg_type_delay_req; + u64 ptp_rx_msg_type_delay_resp; + u64 ptp_rx_msg_type_pdelay_req; + u64 ptp_rx_msg_type_pdelay_resp; + u64 ptp_rx_msg_type_pdelay_follow_up; + u64 ptp_rx_msg_type_announce; + u64 ptp_rx_msg_type_management; + u64 ptp_rx_msg_pkt_reserved_type; + u64 ptp_frame_type; + u64 ptp_ver; + u64 timestamp_dropped; + u64 av_pkt_rcvd; + u64 av_tagged_pkt_rcvd; + u64 vlan_tag_priority_val; + u64 l3_filter_match; + u64 l4_filter_match; + u64 l3_l4_filter_no_match; + /* PCS */ + u64 irq_pcs_ane_n; + u64 irq_pcs_link_n; + u64 irq_rgmii_n; + u64 pcs_link; + u64 pcs_duplex; + u64 pcs_speed; + /* debug register */ + u64 mtl_tx_status_fifo_full; + u64 mtl_tx_fifo_not_empty; + u64 mmtl_fifo_ctrl; + u64 mtl_tx_fifo_read_ctrl_write; + u64 mtl_tx_fifo_read_ctrl_wait; + u64 mtl_tx_fifo_read_ctrl_read; + u64 mtl_tx_fifo_read_ctrl_idle; + u64 mac_tx_in_pause; + u64 mac_tx_frame_ctrl_xfer; + u64 mac_tx_frame_ctrl_idle; + u64 mac_tx_frame_ctrl_wait; + u64 mac_tx_frame_ctrl_pause; + u64 mac_gmii_tx_proto_engine; + u64 mtl_rx_fifo_fill_level_full; + u64 mtl_rx_fifo_fill_above_thresh; + u64 mtl_rx_fifo_fill_below_thresh; + u64 mtl_rx_fifo_fill_level_empty; + u64 mtl_rx_fifo_read_ctrl_flush; + u64 mtl_rx_fifo_read_ctrl_read_data; + u64 mtl_rx_fifo_read_ctrl_status; + u64 mtl_rx_fifo_read_ctrl_idle; + u64 mtl_rx_fifo_ctrl_active; + u64 
mac_rx_frame_ctrl_fifo; + u64 mac_gmii_rx_proto_engine; + /* TSO */ + u64 tx_tso_frames; + u64 tx_tso_nfrags; + /* EST */ + u64 mtl_est_cgce; + u64 mtl_est_hlbs; + u64 mtl_est_hlbf; + u64 mtl_est_btre; + u64 mtl_est_btrlm; + + /* PF/VF Events */ + u64 rst_start_count; /* reset start notify count */ + u64 rst_finish_count; /* reset finish notify count */ + u64 rst_start_ok_count; + u64 rst_finish_ok_count; + u64 rst_start_accept_count; + u64 rst_finish_accept_count; + u64 normal_rst_count; + u64 tx_timeout_rst_count; + u64 dma_chan_err_rst_count; + + u64 tx_frames_129_to_256; + u64 tx_frames_65_to_128; + u64 tx_frames_33_to_64; + u64 tx_frames_17_to_32; + u64 tx_frames_16_below; + /* per queue statistics */ + struct dn200_txq_stats txq_stats[MTL_MAX_TX_QUEUES]; + struct dn200_rxq_stats rxq_stats[MTL_MAX_RX_QUEUES]; +}; + +/* Safety Feature statistics exposed by ethtool */ +struct dn200_safety_stats { + u64 mac_errors[32]; + u64 mtl_errors[32]; + u64 dma_errors[32]; +}; + +#define DN200_FLOW_ACTION_DROP BIT(0) +#define DN200_FLOW_ACTION_ROUTE BIT(1) + +#define DN200_FLOW_TYPE_V4 BIT(0) +#define DN200_FLOW_TYPE_V6 BIT(1) +#define DN200_FLOW_TYPE_SA BIT(2) +#define DN200_FLOW_TYPE_DA BIT(3) +#define DN200_FLOW_TYPE_UDP BIT(4) +#define DN200_FLOW_TYPE_TCP BIT(5) +#define DN200_FLOW_TYPE_DPORT BIT(6) +#define DN200_FLOW_TYPE_SPORT BIT(7) + +struct dn200_fdir_info { + u32 l4_udp_count; + u32 l4_tcp_count; +}; + +struct dn200_fdir_filter { + /* filter input set */ + u8 flow_type; + /*enable */ + bool enable; + /*action */ + u8 action; + /*route dst */ + u8 queue; + /*reg idx */ + u8 reg_idx; + /* TX packet view of src and dst */ + u32 dst_ip; + u32 dst_ip_mask; + int xgmac_mask_dst; + u32 src_ip; + u32 src_ip_mask; + int xgmac_mask_src; + /*ip6 */ +#define DN200_L3L4_IPV6_SA BIT(1) +#define DN200_L3L4_IPV6_DA BIT(2) + u8 ip6_address; + u32 ip6[4]; + u32 ip6_mask[4]; + u16 src_port; + u16 dst_port; +}; + +/* Number of fields in Safety Stats */ +#define
DN200_SAFETY_FEAT_SIZE \ + (sizeof(struct dn200_safety_stats) / sizeof(u64)) + +/* CSR Frequency Access Defines*/ +#define CSR_F_35M 35000000 +#define CSR_F_60M 60000000 +#define CSR_F_100M 100000000 +#define CSR_F_150M 150000000 +#define CSR_F_250M 250000000 +#define CSR_F_300M 300000000 + +#define MAC_CSR_H_FRQ_MASK 0x20 + +#define HASH_TABLE_SIZE 64 +#define PAUSE_TIME 0xffff + +/* Flow Control defines */ +#define FLOW_OFF 0 +#define FLOW_RX 1 +#define FLOW_TX 2 +#define FLOW_AUTO (FLOW_OFF) + +/* PCS defines */ +#define DN200_PCS_RGMII BIT(0) +#define DN200_PCS_SGMII BIT(1) +#define DN200_PCS_TBI BIT(2) +#define DN200_PCS_RTBI BIT(3) + +#define SF_DMA_MODE 1 /* DMA STORE-AND-FORWARD Operation Mode */ + +/* DMA HW feature register fields */ +#define DMA_HW_FEAT_MIISEL 0x00000001 /* 10/100 Mbps Support */ +#define DMA_HW_FEAT_GMIISEL 0x00000002 /* 1000 Mbps Support */ +#define DMA_HW_FEAT_HDSEL 0x00000004 /* Half-Duplex Support */ +#define DMA_HW_FEAT_EXTHASHEN 0x00000008 /* Expanded DA Hash Filter */ +#define DMA_HW_FEAT_HASHSEL 0x00000010 /* HASH Filter */ +#define DMA_HW_FEAT_ADDMAC 0x00000020 /* Multiple MAC Addr Reg */ +#define DMA_HW_FEAT_PCSSEL 0x00000040 /* PCS registers */ +#define DMA_HW_FEAT_L3L4FLTREN 0x00000080 /* Layer 3 & Layer 4 Feature */ +#define DMA_HW_FEAT_SMASEL 0x00000100 /* SMA(MDIO) Interface */ +#define DMA_HW_FEAT_RWKSEL 0x00000200 /* PMT Remote Wakeup */ +#define DMA_HW_FEAT_MGKSEL 0x00000400 /* PMT Magic Packet */ +#define DMA_HW_FEAT_MMCSEL 0x00000800 /* RMON Module */ +#define DMA_HW_FEAT_TSVER1SEL 0x00001000 /* Only IEEE 1588-2002 */ +#define DMA_HW_FEAT_TSVER2SEL 0x00002000 /* IEEE 1588-2008 PTPv2 */ +#define DMA_HW_FEAT_EEESEL 0x00004000 /* Energy Efficient Ethernet */ +#define DMA_HW_FEAT_AVSEL 0x00008000 /* AV Feature */ +#define DMA_HW_FEAT_TXCOESEL 0x00010000 /* Checksum Offload in Tx */ +#define DMA_HW_FEAT_RXTYP1COE 0x00020000 /* IP COE (Type 1) in Rx */ +#define DMA_HW_FEAT_RXTYP2COE 0x00040000 /* IP COE (Type 2) in Rx */
+#define DMA_HW_FEAT_RXFIFOSIZE 0x00080000 /* Rx FIFO > 2048 Bytes */ +#define DMA_HW_FEAT_RXCHCNT 0x00300000 /* No. additional Rx Channels */ +#define DMA_HW_FEAT_TXCHCNT 0x00c00000 /* No. additional Tx Channels */ +#define DMA_HW_FEAT_ENHDESSEL 0x01000000 /* Alternate Descriptor */ +/* Timestamping with Internal System Time */ +#define DMA_HW_FEAT_INTTSEN 0x02000000 +#define DMA_HW_FEAT_FLEXIPPSEN 0x04000000 /* Flexible PPS Output */ +#define DMA_HW_FEAT_SAVLANINS 0x08000000 /* Source Addr or VLAN */ +#define DMA_HW_FEAT_ACTPHYIF 0x70000000 /* Active/selected PHY iface */ +#define DEFAULT_DMA_PBL 8 + +/* MSI defines */ +#define DN200_MSI_VEC_MAX 32 + +/* PCS status and mask defines */ +#define PCS_ANE_IRQ BIT(2) /* PCS Auto-Negotiation */ +#define PCS_LINK_IRQ BIT(1) /* PCS Link */ +#define PCS_RGSMIIIS_IRQ BIT(0) /* RGMII or SMII Interrupt */ + +/* Max/Min RI Watchdog Timer count value */ +#define MAX_DMA_RIWT 0xff +#define MIN_DMA_RIWT 0x01 /* min is 500 ns, but the CPU loading will be higher */ +/* 112 us, frequency is 500MHz, + * total us = (1000 000/500 000 000) * 512 * 0x6E = 112us + */ +#define DEF_DMA_RIWT 0x6E +/* Tx coalesce parameters */ +/* for 10% of 10G 64byte RFC2544, default ring size 512, + * total us (1 * 1000000) * 512 / (1480000) ~= 345us + */ +#define DN200_COAL_TX_TIMER 200 +#define DN200_MAX_COAL_TX_TICK 100000 +#define DN200_MAX_COAL_RX_TICK 8160 +#define DN200_MIN_CLAL_TX_TIME 10 +#define DN200_TX_MAX_FRAMES 256 +#define DN200_TX_FRAMES \ + 64 /* changed from 25, as tx performance was too low on the 3.14 kernel */ + +/* Rx coalesce parameters */ +#define DN200_RX_MAX_FRAMES 256 +#define DN200_RX_FRAMES 48 +#define DN200_RX_MIN_FRAMES 16 +#define DN200_RX_MAX_REFILL_SIZE (64) +/* Packet types */ +enum packets_types { + PACKET_AVCPQ = 0x1, /* AV Untagged Control packets */ + PACKET_PTPQ = 0x2, /* PTP Packets */ + PACKET_DCBCPQ = 0x3, /* DCB Control Packets */ + PACKET_UPQ = 0x4, /* Untagged Packets */ + PACKET_MCBCQ = 0x5, /* Multicast
& Broadcast Packets */ +}; + +/* Rx IPC status */ +enum rx_frame_status { + good_frame = 0x0, + discard_frame = BIT(0), + csum_none = BIT(1), + llc_snap = BIT(2), + dma_own = BIT(3), + rx_not_ls = BIT(4), + buf_len_err = BIT(5), +}; + +/* Tx status */ +enum tx_frame_status { + tx_done = 0x0, + tx_not_ls = 0x1, + tx_err = 0x2, + tx_dma_own = 0x4, +}; + +enum rx_frame_err_types { + watchdog_timeout_err = 0x1, + invalid_code_err = 0x2, + crc_err = 0x3, + giant_packet_err = 0x4, + ip_header_err = 0x5, + l4_chksum_err = 0x6, + overflow_err = 0x7, + bus_err = 0x8, + length_err = 0x9, + good_runt_packet_err = 0xa, + dribble_err = 0xc, + safety_err = 0xf, +}; + +enum tunnel_rx_frame_err_types { + outer_ip_header_err = 0x5, + outer_l4_chksum_err = 0x6, + inner_ip_header_err = 0x9, + inner_l4_chksum_err = 0xa, + invalid_tunnel_header_field = 0xb, +}; + +enum dma_irq_status { + tx_hard_error = 0x1, + tx_hard_error_bump_tc = 0x2, + handle_rx = 0x4, + handle_tx = 0x8, +}; + +enum dma_irq_dir { + DMA_DIR_RX = 0x1, + DMA_DIR_TX = 0x2, + DMA_DIR_RXTX = 0x3, +}; + +enum request_irq_err { + REQ_IRQ_ERR_ALL, + REQ_IRQ_ERR_RXTX, + REQ_IRQ_ERR_TX, + REQ_IRQ_ERR_RX, + REQ_IRQ_ERR_SFTY_UE, + REQ_IRQ_ERR_SFTY_CE, + REQ_IRQ_ERR_LPI, + REQ_IRQ_ERR_MAC, + REQ_IRQ_ERR_NO, +}; + +/* EEE and LPI defines */ +#define CORE_IRQ_TX_PATH_IN_LPI_MODE BIT(0) +#define CORE_IRQ_TX_PATH_EXIT_LPI_MODE BIT(1) +#define CORE_IRQ_RX_PATH_IN_LPI_MODE BIT(2) +#define CORE_IRQ_RX_PATH_EXIT_LPI_MODE BIT(3) + +/* FPE defines */ +#define FPE_EVENT_UNKNOWN 0 +#define FPE_EVENT_TRSP BIT(0) +#define FPE_EVENT_TVER BIT(1) +#define FPE_EVENT_RRSP BIT(2) +#define FPE_EVENT_RVER BIT(3) + +#define CORE_IRQ_MTL_RX_OVERFLOW BIT(8) + +/* Physical Coding Sublayer */ +struct rgmii_adv { + unsigned int pause; + unsigned int duplex; + unsigned int lp_pause; + unsigned int lp_duplex; +}; + +#define DN200_PCS_PAUSE 1 +#define DN200_PCS_ASYM_PAUSE 2 + +/* DMA HW capabilities */ +struct dma_features { + unsigned int mbps_10_100; + 
unsigned int mbps_1000; + unsigned int half_duplex; + unsigned int hash_filter; + unsigned int multi_addr; + unsigned int pcs; + unsigned int sma_mdio; + unsigned int pmt_remote_wake_up; + unsigned int pmt_magic_frame; + unsigned int rmon; + /* IEEE 1588-2002 */ + unsigned int time_stamp; + /* IEEE 1588-2008 */ + unsigned int atime_stamp; + /* 802.3az - Energy-Efficient Ethernet (EEE) */ + unsigned int eee; + unsigned int av; + unsigned int hash_tb_sz; + unsigned int tsoen; + unsigned int dcben; + /* TX and RX csum */ + unsigned int tx_coe; + unsigned int rx_coe; + unsigned int rx_coe_type1; + unsigned int rx_coe_type2; + unsigned int rxfifo_over_2048; + /* TX and RX number of channels */ + unsigned int number_rx_channel; + unsigned int number_tx_channel; + /* TX and RX number of queues */ + unsigned int number_rx_queues; + unsigned int number_tx_queues; + /* PPS output */ + unsigned int pps_out_num; + /* Alternate (enhanced) DESC mode */ + unsigned int enh_desc; + /* TX and RX FIFO sizes */ + unsigned int tx_fifo_size; + unsigned int rx_fifo_size; + /* Automotive Safety Package */ + unsigned int asp; + /* RX Parser */ + unsigned int frpsel; + unsigned int frpbs; + unsigned int frpes; + unsigned int addr64; + unsigned int rssen; + unsigned int vlhash; + unsigned int sphen; + unsigned int vlins; + unsigned int dvlan; + unsigned int l3l4fnum; + unsigned int arpoffsel; + /* TSN Features */ + unsigned int estwid; + unsigned int estdep; + unsigned int estsel; + unsigned int fpesel; + unsigned int tbssel; + /* Numbers of Auxiliary Snapshot Inputs */ + unsigned int aux_snapshot_n; + /* DCB */ + unsigned int tc_cnt; +}; + +/* RX Buffer size must be multiple of 4/8/16 bytes */ +#define BUF_SIZE_16KiB 16368 +#define BUF_SIZE_8KiB 8188 +#define BUF_SIZE_4KiB 4096 +#define BUF_SIZE_3KiB 3072 +#define BUF_SIZE_2KiB 2048 + +/* Common MAC defines */ +#define MAC_CTRL_REG 0x00000000 /* MAC Control */ +#define MAC_ENABLE_TX 0x00000008 /* Transmitter Enable */ +#define MAC_ENABLE_RX 
0x00000004 /* Receiver Enable */ + +/* Default LPI timers */ +#define DN200_DEFAULT_LIT_LS 0x3E8 +#define DN200_DEFAULT_TWT_LS 0x1E +#define DN200_ET_MAX 0xFFFFF + +#define DN200_CHAIN_MODE 0x1 +#define DN200_RING_MODE 0x2 + +#define JUMBO_LEN 9000 + +/* Receive Side Scaling */ +#define DN200_RSS_HASH_KEY_SIZE 40 +#define DN200_RSS_MAX_TABLE_SIZE 256 + +/* VLAN */ +#define DN200_VLAN_NONE 0x0 +#define DN200_VLAN_REMOVE 0x1 +#define DN200_VLAN_INSERT 0x2 +#define DN200_VLAN_REPLACE 0x3 + +/* TSO Desc flag */ +#define TSO_DESC_IS_FIRST 0x1 +#define TSO_DESC_IS_TUNNEL 0x2 + +extern const struct dn200_desc_ops enh_desc_ops; +extern const struct dn200_desc_ops ndesc_ops; + +struct mac_device_info; + +extern const struct dn200_hwtimestamp dn200_ptp; + +struct mac_link { + u32 speed_mask; + u32 speed10; + u32 speed100; + u32 speed1000; + u32 speed2500; + u32 duplex; + struct { + u32 speed2500; + u32 speed5000; + u32 speed10000; + } xgmii; + struct { + u32 speed25000; + u32 speed40000; + u32 speed50000; + u32 speed100000; + } xlgmii; +}; + +struct mii_regs { + unsigned int addr; /* MII Address */ + unsigned int data; /* MII Data */ + unsigned int addr_shift; /* MII address shift */ + unsigned int reg_shift; /* MII reg shift */ + unsigned int addr_mask; /* MII address mask */ + unsigned int reg_mask; /* MII reg mask */ + unsigned int clk_csr_shift; + unsigned int clk_csr_mask; +}; + +struct dn200_set_state { + bool is_promisc; + bool is_allmuslt; + int uc_num; + int mc_num; +}; + +struct mac_device_info { + struct dn200_priv *priv; + const struct dn200_ops *mac; + const struct dn200_desc_ops *desc; + const struct dn200_dma_ops *dma; + const struct dn200_mode_ops *mode; + const struct dn200_hwtimestamp *ptp; + const struct dn200_tc_ops *tc; + const struct dn200_mmc_ops *mmc; + struct dw_xpcs *xpcs; + struct mii_regs mii; /* MII register Addresses */ + struct mac_link link; + void __iomem *pcsr; /* vpointer to device CSRs */ + void __iomem *pmail; /* vpointer to device 
mailbox */ + unsigned int multicast_filter_bins; + unsigned int unicast_filter_entries; + unsigned int mcast_bits_log2; + unsigned int rx_csum; + unsigned int pcs; + unsigned int ps; + unsigned int xlgmac; + unsigned int max_vlan_num; + u32 vlan_filter[32]; + unsigned int promisc; + bool vlan_fail_q_en; + u8 vlan_fail_q; + struct dn200_set_state set_state; + u64 reconfig_times; + u32 cfg_rxp_seq; +}; + +struct dn200_rx_routing { + u32 reg_mask; + u32 reg_shift; +}; + +enum dn200_txrx_mode_dir { + DN200_SET_NONE = BIT(0), + DN200_SET_TX_MODE = BIT(1), + DN200_SET_RX_MODE = BIT(2), + DN200_SET_TXRX_MODE = (DN200_SET_TX_MODE | DN200_SET_RX_MODE), +}; + +int dwmac100_setup(struct dn200_priv *priv); +int dwmac1000_setup(struct dn200_priv *priv); +int dwmac4_setup(struct dn200_priv *priv); +int dwxgmac2_setup(struct dn200_priv *priv); +int dpxgmac_setup(struct dn200_priv *priv); +int dwxlgmac2_setup(struct dn200_priv *priv); + +void dn200_set_mac_addr(void __iomem *ioaddr, u8 addr[6], + unsigned int high, unsigned int low); +void dn200_get_mac_addr(void __iomem *ioaddr, unsigned char *addr, + unsigned int high, unsigned int low); +void dn200_set_mac(void __iomem *ioaddr, bool enable); + +void dn200_dwmac4_set_mac_addr(void __iomem *ioaddr, u8 addr[6], + unsigned int high, unsigned int low); +void dn200_dwmac4_get_mac_addr(void __iomem *ioaddr, unsigned char *addr, + unsigned int high, unsigned int low); +void dn200_dwmac4_set_mac(void __iomem *ioaddr, bool enable); + +extern const struct dn200_mode_ops ring_mode_ops; +extern const struct dn200_mode_ops chain_mode_ops; +extern const struct dn200_desc_ops dwmac4_desc_ops; + +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/descs.h b/drivers/net/ethernet/dapustor/dn200/descs.h new file mode 100644 index 000000000000..780433b0af29 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/descs.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#ifndef __DESCS_H__ +#define __DESCS_H__ + +#include + +/* Normal receive descriptor defines */ + +/* RDES0 */ +#define RDES0_PAYLOAD_CSUM_ERR BIT(0) +#define RDES0_CRC_ERROR BIT(1) +#define RDES0_DRIBBLING BIT(2) +#define RDES0_MII_ERROR BIT(3) +#define RDES0_RECEIVE_WATCHDOG BIT(4) +#define RDES0_FRAME_TYPE BIT(5) +#define RDES0_COLLISION BIT(6) +#define RDES0_IPC_CSUM_ERROR BIT(7) +#define RDES0_LAST_DESCRIPTOR BIT(8) +#define RDES0_FIRST_DESCRIPTOR BIT(9) +#define RDES0_VLAN_TAG BIT(10) +#define RDES0_OVERFLOW_ERROR BIT(11) +#define RDES0_LENGTH_ERROR BIT(12) +#define RDES0_SA_FILTER_FAIL BIT(13) +#define RDES0_DESCRIPTOR_ERROR BIT(14) +#define RDES0_ERROR_SUMMARY BIT(15) +#define RDES0_FRAME_LEN_MASK GENMASK(29, 16) +#define RDES0_FRAME_LEN_SHIFT 16 +#define RDES0_DA_FILTER_FAIL BIT(30) +#define RDES0_OWN BIT(31) + /* RDES1 */ +#define RDES1_BUFFER1_SIZE_MASK GENMASK(10, 0) +#define RDES1_BUFFER2_SIZE_MASK GENMASK(21, 11) +#define RDES1_BUFFER2_SIZE_SHIFT 11 +#define RDES1_SECOND_ADDRESS_CHAINED BIT(24) +#define RDES1_END_RING BIT(25) +#define RDES1_DISABLE_IC BIT(31) + +/* Enhanced receive descriptor defines */ + +/* RDES0 (similar to normal RDES) */ +#define ERDES0_RX_MAC_ADDR BIT(0) + +/* RDES1: completely differ from normal desc definitions */ +#define ERDES1_BUFFER1_SIZE_MASK GENMASK(12, 0) +#define ERDES1_SECOND_ADDRESS_CHAINED BIT(14) +#define ERDES1_END_RING BIT(15) +#define ERDES1_BUFFER2_SIZE_MASK GENMASK(28, 16) +#define ERDES1_BUFFER2_SIZE_SHIFT 16 +#define ERDES1_DISABLE_IC BIT(31) + +/* Normal transmit descriptor defines */ +/* TDES0 */ +#define TDES0_DEFERRED BIT(0) +#define TDES0_UNDERFLOW_ERROR BIT(1) +#define TDES0_EXCESSIVE_DEFERRAL BIT(2) +#define TDES0_COLLISION_COUNT_MASK GENMASK(6, 3) +#define TDES0_VLAN_FRAME BIT(7) +#define TDES0_EXCESSIVE_COLLISIONS BIT(8) +#define TDES0_LATE_COLLISION BIT(9) +#define TDES0_NO_CARRIER BIT(10) +#define TDES0_LOSS_CARRIER BIT(11) +#define TDES0_PAYLOAD_ERROR BIT(12) +#define TDES0_FRAME_FLUSHED 
BIT(13) +#define TDES0_JABBER_TIMEOUT BIT(14) +#define TDES0_ERROR_SUMMARY BIT(15) +#define TDES0_IP_HEADER_ERROR BIT(16) +#define TDES0_TIME_STAMP_STATUS BIT(17) +#define TDES0_OWN ((u32)BIT(31)) /* silence sparse */ +/* TDES1 */ +#define TDES1_BUFFER1_SIZE_MASK GENMASK(10, 0) +#define TDES1_BUFFER2_SIZE_MASK GENMASK(21, 11) +#define TDES1_BUFFER2_SIZE_SHIFT 11 +#define TDES1_TIME_STAMP_ENABLE BIT(22) +#define TDES1_DISABLE_PADDING BIT(23) +#define TDES1_SECOND_ADDRESS_CHAINED BIT(24) +#define TDES1_END_RING BIT(25) +#define TDES1_CRC_DISABLE BIT(26) +#define TDES1_CHECKSUM_INSERTION_MASK GENMASK(28, 27) +#define TDES1_CHECKSUM_INSERTION_SHIFT 27 +#define TDES1_FIRST_SEGMENT BIT(29) +#define TDES1_LAST_SEGMENT BIT(30) +#define TDES1_INTERRUPT BIT(31) + +/* Enhanced transmit descriptor defines */ +/* TDES0 */ +#define ETDES0_DEFERRED BIT(0) +#define ETDES0_UNDERFLOW_ERROR BIT(1) +#define ETDES0_EXCESSIVE_DEFERRAL BIT(2) +#define ETDES0_COLLISION_COUNT_MASK GENMASK(6, 3) +#define ETDES0_VLAN_FRAME BIT(7) +#define ETDES0_EXCESSIVE_COLLISIONS BIT(8) +#define ETDES0_LATE_COLLISION BIT(9) +#define ETDES0_NO_CARRIER BIT(10) +#define ETDES0_LOSS_CARRIER BIT(11) +#define ETDES0_PAYLOAD_ERROR BIT(12) +#define ETDES0_FRAME_FLUSHED BIT(13) +#define ETDES0_JABBER_TIMEOUT BIT(14) +#define ETDES0_ERROR_SUMMARY BIT(15) +#define ETDES0_IP_HEADER_ERROR BIT(16) +#define ETDES0_TIME_STAMP_STATUS BIT(17) +#define ETDES0_SECOND_ADDRESS_CHAINED BIT(20) +#define ETDES0_END_RING BIT(21) +#define ETDES0_CHECKSUM_INSERTION_MASK GENMASK(23, 22) +#define ETDES0_CHECKSUM_INSERTION_SHIFT 22 +#define ETDES0_TIME_STAMP_ENABLE BIT(25) +#define ETDES0_DISABLE_PADDING BIT(26) +#define ETDES0_CRC_DISABLE BIT(27) +#define ETDES0_FIRST_SEGMENT BIT(28) +#define ETDES0_LAST_SEGMENT BIT(29) +#define ETDES0_INTERRUPT BIT(30) +#define ETDES0_OWN ((u32)BIT(31)) /* silence sparse */ +/* TDES1 */ +#define ETDES1_BUFFER1_SIZE_MASK GENMASK(12, 0) +#define ETDES1_BUFFER2_SIZE_MASK GENMASK(28, 16) +#define 
ETDES1_BUFFER2_SIZE_SHIFT 16 + +/* Extended Receive descriptor definitions */ +#define ERDES4_IP_PAYLOAD_TYPE_MASK GENMASK(6, 2) +#define ERDES4_IP_HDR_ERR BIT(3) +#define ERDES4_IP_PAYLOAD_ERR BIT(4) +#define ERDES4_IP_CSUM_BYPASSED BIT(5) +#define ERDES4_IPV4_PKT_RCVD BIT(6) +#define ERDES4_IPV6_PKT_RCVD BIT(7) +#define ERDES4_MSG_TYPE_MASK GENMASK(11, 8) +#define ERDES4_PTP_FRAME_TYPE BIT(12) +#define ERDES4_PTP_VER BIT(13) +#define ERDES4_TIMESTAMP_DROPPED BIT(14) +#define ERDES4_AV_PKT_RCVD BIT(16) +#define ERDES4_AV_TAGGED_PKT_RCVD BIT(17) +#define ERDES4_VLAN_TAG_PRI_VAL_MASK GENMASK(20, 18) +#define ERDES4_L3_FILTER_MATCH BIT(24) +#define ERDES4_L4_FILTER_MATCH BIT(25) +#define ERDES4_L3_L4_FILT_NO_MATCH_MASK GENMASK(27, 26) + +/* Extended RDES4 message type definitions */ +#define RDES_EXT_NO_PTP 0x0 +#define RDES_EXT_SYNC 0x1 +#define RDES_EXT_FOLLOW_UP 0x2 +#define RDES_EXT_DELAY_REQ 0x3 +#define RDES_EXT_DELAY_RESP 0x4 +#define RDES_EXT_PDELAY_REQ 0x5 +#define RDES_EXT_PDELAY_RESP 0x6 +#define RDES_EXT_PDELAY_FOLLOW_UP 0x7 +#define RDES_PTP_ANNOUNCE 0x8 +#define RDES_PTP_MANAGEMENT 0x9 +#define RDES_PTP_SIGNALING 0xa +#define RDES_PTP_PKT_RESERVED_TYPE 0xf +/* Basic descriptor structure for normal and alternate descriptors */ +struct dma_desc { + __le32 des0; + __le32 des1; + __le32 des2; + __le32 des3; +}; + +/* Transmit checksum insertion control */ +#define TX_CIC_FULL 3 /* Include IP header and pseudoheader */ + +#endif /* __DESCS_H__ */ diff --git a/drivers/net/ethernet/dapustor/dn200/dn200.h b/drivers/net/ethernet/dapustor/dn200/dn200.h new file mode 100644 index 000000000000..0a7dfe43d616 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#ifndef __DN200_H__ +#define __DN200_H__ + +#define SIMPLE_MODULE_VERSION +#define DN200_RESOURCE_NAME "dn200" +#define DRV_MODULE_VERSION "1.0.60" + +#include +#include +#include +#include +#include +#include "common.h" +#include +#include +#include +#include +#include +#include "dn200_self.h" +#include "dn200_sriov.h" +#include "dn200_spec_acc.h" +#include "dn200_dcb.h" +#include "dn200_iatu.h" +#include "dn200_pool.h" +#include "dn200_reg.h" + +#define LINK_UP_SET BIT(0) +#define LINK_DOWN_SET BIT(1) +#define TXRX_ITR_PROCESS_SELF 0 +#define TXRX_ITR_COMBINED 1 + +#undef HAVE_AF_XDP_ZC_SUPPORT +struct dn200_resources { + void __iomem *addr; + void __iomem *mail; + void __iomem *ctrl_addr; + u8 mac[ETH_ALEN]; + int lpi_irq; + int irq; + int sfty_ce_irq; + int sfty_ue_irq; + int xpcs_vec; + int rx_irq[MTL_MAX_RX_QUEUES]; + int tx_irq[MTL_MAX_TX_QUEUES]; + struct ctrl_resource *ctrl; +}; + +enum dn200_txbuf_type { + DN200_TXBUF_T_SKB, + DN200_TXBUF_T_XDP_TX, + DN200_TXBUF_T_XDP_NDO, + DN200_TXBUF_T_XSK_TX, +}; + +enum dn200_txbuf_mem_type { + DN200_NORMAL, + DN200_DMA32, +}; + +struct dn200_tx_info { + dma_addr_t buf; + bool map_as_page; + unsigned int len; + bool last_segment; + bool is_jumbo; + enum dn200_txbuf_type buf_type; + atomic_t *iatu_ref_ptr; +}; + +#define DN200_TBS_AVAIL BIT(0) +#define DN200_TBS_EN BIT(1) + +/* DMA32 tx buffer used for: + * 1. pure pf dma addr exceed 1TB; + * 2. sriov pf dma addr exceed 896GB; + * 3. 
sriov vf dma addr exceed 4GB + */ +struct dn200_tx_dma32_buff { + enum dn200_txbuf_mem_type mem_type; + struct page *page; + dma_addr_t buf; + int order; + unsigned int len; + atomic_t *iatu_ref_ptr; +}; + +/* Frequently used values are kept adjacent for cache effect */ +struct dn200_tx_queue { + u32 tx_count_frames; + int tbs; + struct work_struct tx_task; + struct hrtimer txtimer; + struct work_struct poll_tx_task; + struct hrtimer poll_txtimer; + u32 queue_index; + struct dn200_priv *priv_data; + struct dma_desc *dma_tx ____cacheline_aligned_in_smp; + union { + struct sk_buff **tx_skbuff; + struct xdp_frame **xdpf; + }; + struct dn200_tx_info *tx_skbuff_dma; + struct dn200_tx_dma32_buff *tx_dma32_bufs; + atomic_t tx_scheduling; + struct dn200_queue_iatu_info iatu_info; + struct xsk_buff_pool *xsk_pool; + u32 xsk_frames_done; + unsigned int cur_tx; + unsigned int next_to_watch; + unsigned int dirty_tx; + dma_addr_t dma_tx_phy; + dma_addr_t origin_dma_tx_phy; + dma_addr_t tx_tail_addr; + atomic_t txtimer_running; + bool task_need_sch; + bool txtimer_need_sch; + unsigned int old_dirty_tx; + u32 mss; +}; + +/* one rx page can be used twice at the same time, + * one for protocol statck rx pkt, another for hw rx dma + */ +#define RX_PAGE_MAX_USED_COUNT 2 +enum { + FIRST_PAGE, + SECON_PAGE, +}; + +struct dn200_rx_buffer { + union { + struct { + struct page *page; + dma_addr_t kernel_addr; + dma_addr_t desc_addr; + __u32 page_offset; + u16 rx_times; + struct dn200_page_buf *pg_buf; + }; + struct xdp_buff *xdp; + }; + struct page *sec_page; + dma_addr_t sec_addr; + __u32 sec_page_offset; /* second page offsent sph */ + struct dn200_page_buf *sec_pg_buf; +}; + +struct dn200_rx_queue { + u32 rx_count_frames; + u32 queue_index; + struct page_pool *page_pool; + struct dn200_rx_buffer *buf_pool; + struct dn200_priv *priv_data; + struct dma_desc *dma_rx ____cacheline_aligned_in_smp; + struct dn200_queue_iatu_info iatu_info; + atomic_t rx_scheduling; + unsigned int cur_rx; + 
unsigned int dirty_rx; + unsigned int alloc_rx; + unsigned int buf_alloc_num; + u32 rx_zeroc_thresh; + dma_addr_t origin_dma_rx_phy; + dma_addr_t dma_rx_phy; + u32 rx_tail_addr; + unsigned int state_saved; + struct { + struct sk_buff *skb; + unsigned int len; + unsigned int error; + } state; + struct dn200_bufpool *rx_pool; + struct work_struct poll_rx_task; + struct hrtimer poll_rxtimer; +} ____cacheline_internodealigned_in_smp; + +#define ITR_COUNTDOWN_COUNT 3 +#define DN200_ITR_MASK 0x1fff +#define DN200_ITR_MIN_INC 0x0002 +#define DN200_ITR_MIN_USECS 0x0002 +#define DN200_ITR_MIN_USECS_1G 0x14 +#define DN200_ITR_MAX_USECS 0x70 +#define DN200_ITR_MAX_RWT 0xff +#define DN200_ITR_RWT_BOUND 0x14 + +#define DN200_ITR_ADAPTIVE_LATENCY 0x8000 +#define DN200_ITR_ADAPTIVE_BULK 0x0000 +#define DN200_ITR_DYNAMIC_ITR 0x8000 + +struct dn200_itr_info { + unsigned long next_update; + u64 packet; + u64 bytes; + u32 target_itr; /* target ITR setting for ring(s) */ + u32 current_itr; /* current ITR setting for ring(s) */ + u8 itr_countdown; + u16 itr_div; + u16 itr_setting; +}; + +struct itr_update_ops { + void (*dn200_rx_itr_update)(struct dn200_itr_info *itr, + struct dn200_priv *priv, u8 chan); +}; + +struct dn200_channel { + struct napi_struct rx_napi ____cacheline_aligned_in_smp; + struct napi_struct tx_napi ____cacheline_aligned_in_smp; + struct napi_struct agg_napi; + struct napi_struct rxtx_napi; + struct dn200_priv *priv_data; + bool in_sch; + struct dn200_rx_queue *rx_q; + spinlock_t lock; /* lock for dma channel hw setting */ + u32 index; +}; +struct dn200_tc_entry { + bool in_use; + bool in_hw; + bool is_last; + bool is_frag; + void *frag_ptr; + unsigned int table_pos; + u32 handle; + u32 prio; + struct { + u32 match_data; + u32 match_en; + u8 af : 1; + u8 rf : 1; + u8 im : 1; + u8 nc : 1; + u8 res1 : 4; + u8 frame_offset; + u8 ok_index; + u8 dma_ch_no; + u32 res2; + } __packed val; +}; + +#define DN200_PPS_MAX 4 +struct dn200_pps_cfg { + bool available; + struct 
timespec64 start; + struct timespec64 period; +}; + +#define DN200_RSS_IP2TE 1 +#define DN200_RSS_UDP4TE 2 +#define DN200_RSS_TCP4TE 4 + +struct dn200_rss { + int enable; + u32 rss_flags; + u8 key[DN200_RSS_HASH_KEY_SIZE]; + u32 table[DN200_RSS_MAX_TABLE_SIZE]; +}; + +struct dn200_flow_entry { + unsigned long cookie; + unsigned long action; + u8 ip_proto; + int in_use; + int idx; + int is_l4; +}; + +/* Rx Frame Steering */ +enum dn200_rfs_type { + DN200_RFS_T_VLAN, + DN200_RFS_T_MAX, +}; + +enum dn200_pf_rxp_set_type { + RXP_SET_VLAN_FIL = BIT(0), + RXP_SET_VLAN_ID = BIT(1), + RXP_SET_UMAC = BIT(2), + RXP_SET_FIL = BIT(3), + RXP_CLEAR_VF_RXP = BIT(4), +}; + +struct dn200_rfs_entry { + unsigned long cookie; + int in_use; + int type; + int tc; +}; + +struct dn200_mem_info { + struct page *page; + int page_ref_bias; + dma_addr_t dma_addr; + dma_addr_t base_addr; +}; + +struct dn200_page_pool { + struct device *device; + int reserve_page; + struct dn200_mem_info *mem_info; + int page_order; + int total_pages; /* total pages of the page pool */ + int alloced_pages; +}; + +struct dn200_priv { + /* Frequently used values are kept adjacent for cache effect */ + u32 tx_coal_frames[MTL_MAX_TX_QUEUES]; + u32 tx_coal_timer[MTL_MAX_TX_QUEUES]; + u32 rx_coal_frames[MTL_MAX_RX_QUEUES]; + u32 tx_coal_frames_set[MTL_MAX_TX_QUEUES]; + u64 tx_mem_copy; + int tx_coalesce; + int hwts_tx_en; + bool tx_path_in_lpi_mode; + bool tso; + int sph; + int sph_cap; + u32 sarc_type; + u8 txrx_itr_combined; + cpumask_t *cpu_mask; + unsigned int dma_buf_sz; + u32 rx_riwt[MTL_MAX_RX_QUEUES]; + u32 rx_rius[MTL_MAX_RX_QUEUES]; /* rx interrupt coalesce: rx-usecs */ + int hwts_rx_en; + u32 rx_itr_usec; /* rx itr usecs based on mtu & rx desc ring size */ + u32 rx_itr_usec_min; + const struct itr_update_ops *dn200_update_ops; + void __iomem *ioaddr; + struct net_device *dev; + struct device *device; + struct mac_device_info *hw; + int (*hwif_quirks)(struct dn200_priv *priv); + struct mutex lock; /* lock 
for eee setting */ + + /* RX Queue */ + struct dn200_rx_queue rx_queue[MTL_MAX_RX_QUEUES]; + unsigned int dma_rx_size; + + /* TX Queue */ + struct dn200_tx_queue tx_queue[MTL_MAX_TX_QUEUES]; + unsigned int dma_tx_size; + + /* Generic channel for NAPI */ + struct dn200_channel channel[DN200_CH_MAX]; + + /* RX Queue */ + struct dn200_itr_info rx_intr[MTL_MAX_RX_QUEUES]; + + /* TX Queue */ + struct dn200_itr_info tx_intr[MTL_MAX_RX_QUEUES]; + + struct dn200_priv_iatu_info iatu_info; + bool dma32_iatu_used; + + int speed; + u32 max_usecs; + u32 min_usecs; + bool flow_ctrl_an; + unsigned int flow_ctrl; + unsigned int pause; + unsigned int duplex; + struct mii_bus *mii; + int mii_irq[PHY_MAX_ADDR]; + + struct dn200_extra_stats xstats ____cacheline_aligned_in_smp; + struct dn200_safety_stats sstats; + struct plat_dn200enet_data *plat; + struct plat_dn200_data *plat_ex; + struct dma_features dma_cap; + struct dn200_counters mmc; + struct dn200_swcounters swc; + int hw_cap_support; + int chip_id; + u32 msg_enable; + int clk_csr; + struct timer_list eee_ctrl_timer; + struct timer_list reset_timer; + struct timer_list keepalive_timer; + struct timer_list upgrade_timer; + int lpi_irq; + int eee_enabled; + int eee_active; + int tx_lpi_timer; + int tx_lpi_enabled; + int eee_tw_timer; + bool eee_sw_timer_en; + unsigned int mode; + struct hwtstamp_config tstamp_config; + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_clock_ops; + unsigned int default_addend; + u32 sub_second_inc; + u32 systime_flags; + u32 adv_ts; + int use_riwt; + spinlock_t ptp_lock; /* ptp lock for reg setting */ + /* Protects auxiliary snapshot registers from concurrent access. 
*/ + struct mutex aux_ts_lock; + + void __iomem *mmcaddr; + void __iomem *ptpaddr; + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + int sfty_ce_irq; + int sfty_ue_irq; + int xpcs_irq; + int rx_irq[MTL_MAX_RX_QUEUES]; + int tx_irq[MTL_MAX_TX_QUEUES]; + /*irq name */ + char int_name_mac[IFNAMSIZ + 9]; + char int_name_xpcs[IFNAMSIZ + 9]; + char int_name_wol[IFNAMSIZ + 9]; + char int_name_lpi[IFNAMSIZ + 9]; + char int_name_sfty_ce[IFNAMSIZ + 10]; + char int_name_sfty_ue[IFNAMSIZ + 10]; + char int_name_rx_irq[MTL_MAX_RX_QUEUES][IFNAMSIZ + 14]; + char int_name_tx_irq[MTL_MAX_TX_QUEUES][IFNAMSIZ + 18]; + + struct dentry *dbgfs_dir; + + unsigned long state; + struct workqueue_struct *wq; + struct workqueue_struct *tx_wq; + struct work_struct service_task; + struct work_struct reconfig_task; + struct work_struct retask; + struct work_struct vf_process_task; + struct work_struct vf_linkset_task; + struct work_struct rxp_task; + /* Workqueue for handling FPE hand-shaking */ + unsigned long fpe_task_state; + struct workqueue_struct *fpe_wq; + struct work_struct fpe_task; + char wq_name[IFNAMSIZ + 4]; + + /* TC Handling */ + unsigned int tc_entries_max; + unsigned int tc_off_max; + struct dn200_tc_entry *tc_entries; + unsigned int flow_entries_max; + u32 fdir_map; + u32 fdir_counts; + struct dn200_flow_entry *flow_entries; + struct dn200_fdir_filter *fdir_enties; + struct dn200_fdir_info fdir_info; + + /* Pulse Per Second output */ + struct dn200_pps_cfg pps[DN200_PPS_MAX]; + + /* Receive Side Scaling */ + struct dn200_rss rss; + + /* XDP BPF Program */ + unsigned long *af_xdp_zc_qps; + struct bpf_prog *xdp_prog; + u64 eth_priv_flags; + unsigned int vxlan_port; + bool rec_all; + /* DCB support */ + struct ieee_ets *ets; + struct ieee_pfc *pfc; + unsigned int q2tc_map[MTL_MAX_TX_QUEUES]; + unsigned int prio2q_map[IEEE_8021QAZ_MAX_TCS]; + unsigned int pfcq[MTL_MAX_TX_QUEUES]; + u8 num_tcs; + u8 prio2tc_bitmap[IEEE_8021QAZ_MAX_TCS]; + u8 dscp_app_cnt; + u8 
dscp2up[DN200_TRUST_DSCP]; + struct dn200_page_pool page_pool; + struct dn200_bufpool buf_pool; + struct dn200_vf_rxp_async_info async_info[DN200_MAX_VF_NUM]; + enum dn200_pf_rxp_set_type pf_rxp_set; + bool vlan_fil_enable; + u8 clear_vf_rxp_bitmap; + int numa_node; + u8 speed_cmd; + + u8 vf_link_forced[DN200_MAX_VF_NUM]; + u8 vf_link_action; + bool blink_state_last; + int mtl_queue_fifo_avg; + int mtl_queue_fifo_more; + int tx_fifo_queue_0; + int vf_tx_fifo_size; + struct device_attribute *temp_attr; + int upgrade_time; + bool flag_upgrade; + bool vf_sw_close_flag; + bool update_fail; +}; + +enum dn200_state { + DN200_DOWN, + DN200_RESET_REQUESTED, + DN200_RESETING, + DN200_SERVICE_SCHED, + DN200_DCB_DOWN, + DN200_SUSPENDED, + DN200_NET_SUSPENDED, + DN200_SFP_IN_INIT, + DN200_ERR_RESET, + DN200_VF_NOTIFY_PF_RESET, + DN200_VF_FLOW_STATE_SET, + DN200_IN_REMOVE, + DN200_DEV_ERR_CLOSE, + DN200_VF_IN_STOP, + DN200_DEV_INIT, /* can't set rxp in dev init state */ + DN200_IATU_INIT, + DN200_PCIE_UNAVAILD, + DN200_RXP_SETTING = 17, + DN200_RXP_NEED_CHECK = 18, + DN200_MAC_LINK_DOWN = 19, + DN200_IN_TASK = 20, + DN200_VF_FLOW_OPEN = 21, + DN200_VF_FLOW_CLOSE = 22, + DN200_VF_FLOW_OPEN_SET = 23, + DN200_PF_NORMAL_CLOSE = 24, + DN200_PF_NORMAL_OPEN = 25, + DN200_PF_FLOW_NORMAL_SET = 26, + DN200_PF_DOWN_UPGRADE = 27, + DN200_SYS_SUSPENDED = 28, + DN200_IS_BONDING = 29, + DN200_PROBE_FINISHED = 30, + DN200_UP = 31, +}; + +enum dn200_err_rst_type { + DN200_TX_TIMEOUT = 0, + DN200_SAFETY_FEAT_INT = 1, + DN200_DMA_CHAN_ERR = 2, + DN200_TX_RESET = 3, + DN200_VF_TO_PF = 4, + DN200_NORMAL_RESET = 5, + POLL_FW_CQ_TIMEOUT = 6, + DN200_PCIE_UNAVAILD_ERR = 7, + DN200_PHY_MPLLA_UNLOCK = 8, + DN200_DMA_DEBUG_ERR = 9, + DN200_FC_VF_STOP = 10, +}; + +int dn200_page_buf_alloc(struct dn200_priv *priv); +int dn200_page_buf_get(struct dn200_priv *priv, u32 queue); +int dn200_page_buf_put(struct dn200_priv *priv, u32 queue); + +int dn200_mdio_unregister(struct net_device *ndev); +int 
dn200_mdio_register(struct net_device *ndev); +int dn200_mdio_reset(struct mii_bus *mii); +int dn200_xpcs_setup(struct mii_bus *mii); +void dn200_set_ethtool_ops(struct net_device *netdev); +int dn200_init_tstamp_counter(struct dn200_priv *priv, u32 systime_flags); +void dn200_ptp_register(struct dn200_priv *priv); +void dn200_ptp_unregister(struct dn200_priv *priv); +int dn200_xdp_open(struct net_device *dev); +void dn200_xdp_release(struct net_device *dev); +int dn200_resume(struct device *dev); +int dn200_suspend(struct device *dev); +int dn200_dvr_remove(struct device *dev); +int dn200_dvr_probe(struct device *device, + struct plat_dn200enet_data *plat_dat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res); +void dn200_disable_eee_mode(struct dn200_priv *priv); +bool dn200_eee_init(struct dn200_priv *priv); +int dn200_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt); +int dn200_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size); +void dn200_fpe_handshake(struct dn200_priv *priv, bool enable); +int dn200_reinit_hwts(struct net_device *dev, bool initial, u32 new_flags); +static inline bool dn200_xdp_is_enabled(struct dn200_priv *priv) +{ + return !!priv->xdp_prog; +} +int dn200_dev_event(struct notifier_block *unused, + unsigned long event, void *ptr); +void self_reset(struct dn200_priv *priv); +void dn200_disable_rx_queue(struct dn200_priv *priv, u32 queue); +void dn200_enable_rx_queue(struct dn200_priv *priv, u32 queue); +void dn200_disable_tx_queue(struct dn200_priv *priv, u32 queue); +void dn200_enable_tx_queue(struct dn200_priv *priv, u32 queue); +int dn200_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags); +void dn200_dma_operation_mode(struct dn200_priv *priv); +void dn200_enable_all_queues(struct dn200_priv *priv); +int dn200_tx_clean(struct dn200_priv *priv, int budget, u32 queue); +void dn200_tx_iatu_ref_clean(struct dn200_priv *priv, struct dn200_tx_queue *tx_q); +void dn200_normal_reset(struct 
dn200_priv *priv); +void dn200_fw_err_dev_close(struct dn200_priv *priv); +void dn200_global_err(struct dn200_priv *priv, enum dn200_err_rst_type err_type); +void dn200_vf_work(struct dn200_priv *priv); +void dn200_async_rxp_work(struct dn200_priv *priv); +void dn200_vf_mac_change(struct dn200_priv *priv); +void dn200_vf_link_set(struct dn200_priv *priv, u8 link_reset); +/*#if IS_ENABLED(CONFIG_DN200_SELFTESTS)*/ +#define DN200_SELFTEST +void dn200_selftest_run(struct net_device *dev, + struct ethtool_test *etest, u64 *buf); +void dn200_selftest_get_strings(struct dn200_priv *priv, u8 *data); +int dn200_selftest_get_count(struct dn200_priv *priv); + +#define DEFAULT_BUFSIZE 1536 +static inline u32 dn200_usec2riwt(u64 usec, struct dn200_priv *priv) +{ + unsigned long clk; + + clk = priv->plat->clk_ref_rate; + if (!clk) + return 0; + /* rwtu(rx watchdog timer count unit) use 512, so right shift 9*/ + return (usec * (clk / 1000000)) >> 9; +} + +static inline unsigned int dn200_get_bfsize(void) +{ + int ret = 0; + + /* We use a 1536 buffer size for standard Ethernet mtu or jumbo frame. + * This gives us enough room for shared info(320) and 192 bytes of padding. 
+ * When (NET_SKB_PAD + 1536) is bigger than (2048 - sizeof of the shared info (320)),
+ * use a suitable buffer size
*rx_pause); +int extern_phy_mdix_status_get(struct phy_device *phydev, u8 *mdix, u8 *mdix_ctrl); +int extern_phy_mdix_status_set(struct phy_device *phydev, u8 ctrl); +int ytphy_read_ext(struct phy_device *phydev, u32 regnum); +int ytphy_write_ext(struct phy_device *phydev, u32 regnum, u16 val); +void dn200_start_all_dma(struct dn200_priv *priv); +void dn200_stop_all_dma(struct dn200_priv *priv); +int dn200_clean_all_tx_queues(struct dn200_priv *priv, u8 tx_queue_num); +int dn200_clean_all_rx_queues(struct dn200_priv *priv); +u32 dn200_riwt2usec(u32 riwt, struct dn200_priv *priv); +void dn200_xgmac_clock_ctl(struct dn200_priv *priv); +void dn200_normal_close_open(struct dn200_priv *priv); +int dn200_config_interrupt(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res, bool is_nvme_pf); + +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_cfg.h b/drivers/net/ethernet/dapustor/dn200/dn200_cfg.h new file mode 100644 index 000000000000..56a76341c2be --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_cfg.h @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + */ + +#ifndef __DN200_CFG_H__ +#define __DN200_CFG_H__ + +#include +#include + +#define MTL_MAX_RX_QUEUES 8 +#define MTL_MAX_TX_QUEUES 8 +#define DN200_CH_MAX 8 + +#define DN200_RX_COE_NONE 0 +#define DN200_RX_COE_TYPE1 1 +#define DN200_RX_COE_TYPE2 2 + +/* Define the macros for CSR clock range parameters to be passed by + * platform code. + * This could also be configured at run time using CPU freq framework. 
+ */ + +/* MDC Clock Selection define*/ +#define DN200_CSR_60_100M 0x0 /* MDC = clk_scr_i/42 */ +#define DN200_CSR_100_150M 0x1 /* MDC = clk_scr_i/62 */ +#define DN200_CSR_20_35M 0x2 /* MDC = clk_scr_i/16 */ +#define DN200_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ +#define DN200_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ +#define DN200_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ + +/* MTL algorithms identifiers */ +#define MTL_TX_ALGORITHM_WRR 0x0 +#define MTL_TX_ALGORITHM_WFQ 0x1 +#define MTL_TX_ALGORITHM_DWRR 0x2 +#define MTL_TX_ALGORITHM_SP 0x3 +#define MTL_RX_ALGORITHM_SP 0x4 +#define MTL_RX_ALGORITHM_WSP 0x5 + +/* RX/TX Queue Mode */ +#define MTL_QUEUE_AVB 0x0 +#define MTL_QUEUE_DCB 0x1 + +/* The MDC clock could be set higher than the IEEE 802.3 + * specified frequency limit 0f 2.5 MHz, by programming a clock divider + * of value different than the above defined values. The resultant MDIO + * clock frequency of 12.5 MHz is applicable for the interfacing chips + * supporting higher MDC clocks. + * The MDC clock selection macros need to be defined for MDC clock rate + * of 12.5 MHz, corresponding to the following selection. 
+ */ +#define DN200_CSR_I_4 0x8 /* clk_csr_i/4 */ +#define DN200_CSR_I_6 0x9 /* clk_csr_i/6 */ +#define DN200_CSR_I_8 0xA /* clk_csr_i/8 */ +#define DN200_CSR_I_10 0xB /* clk_csr_i/10 */ +#define DN200_CSR_I_12 0xC /* clk_csr_i/12 */ +#define DN200_CSR_I_14 0xD /* clk_csr_i/14 */ +#define DN200_CSR_I_16 0xE /* clk_csr_i/16 */ +#define DN200_CSR_I_18 0xF /* clk_csr_i/18 */ + +/* AXI DMA Burst length supported */ +#define DMA_AXI_BLEN_4 BIT(1) +#define DMA_AXI_BLEN_8 BIT(2) +#define DMA_AXI_BLEN_16 BIT(3) +#define DMA_AXI_BLEN_32 BIT(4) +#define DMA_AXI_BLEN_64 BIT(5) +#define DMA_AXI_BLEN_128 BIT(6) +#define DMA_AXI_BLEN_256 BIT(7) +#define DMA_AXI_BLEN_ALL \ + (DMA_AXI_BLEN_4 | DMA_AXI_BLEN_8 | DMA_AXI_BLEN_16 | DMA_AXI_BLEN_32 | \ + DMA_AXI_BLEN_64 | DMA_AXI_BLEN_128 | DMA_AXI_BLEN_256) + +/* Platform data for platform device structure's platform_data field */ + +struct dn200_mdio_bus_data { + unsigned int phy_mask; + unsigned int has_xpcs; + unsigned int xpcs_an_inband; + int *irqs; + int probed_phy_irq; + bool needs_reset; +}; + +struct dn200_dma_cfg { + int pbl; + int txpbl; + int rxpbl; + bool pblx8; + int fixed_burst; + int mixed_burst; + bool aal; + bool onekbbe; + bool eame; + bool multi_msi_en; + bool dche; +}; + +#define AXI_BLEN 7 +struct dn200_axi { + bool axi_lpi_en; + bool axi_xit_frm; + u32 axi_wr_osr_lmt; + u32 axi_rd_osr_lmt; + bool axi_kbbe; + u32 axi_blen[AXI_BLEN]; + bool axi_fb; + bool axi_mb; + bool axi_rb; +}; + +#define EST_GCL 1024 +struct dn200_est { + /*Avoid confinct between read with config*/ + struct mutex lock; + int enable; + u32 btr_reserve[2]; + u32 btr_offset[2]; + u32 btr[2]; + u32 ctr[2]; + u32 ter; + u32 gcl_unaligned[EST_GCL]; + u32 gcl[EST_GCL]; + u32 gcl_size; +}; + +struct dn200_rxq_cfg { + u8 mode_to_use; + u32 chan; + u8 pkt_route; + bool use_prio; + u32 prio; + u32 weight; +}; + +struct dn200_txq_cfg { + u32 weight; + u8 mode_to_use; + /* Credit Base Shaper parameters */ + u32 send_slope; + u32 idle_slope; + u32 
high_credit; + u32 low_credit; + bool use_prio; + u32 prio; + int tbs_en; +}; + +/* FPE link state */ +enum dn200_fpe_state { + FPE_STATE_OFF = 0, + FPE_STATE_CAPABLE = 1, + FPE_STATE_ENTERING_ON = 2, + FPE_STATE_ON = 3, +}; + +/* FPE link-partner hand-shaking mPacket type */ +enum dn200_mpacket_type { + MPACKET_VERIFY = 0, + MPACKET_RESPONSE = 1, +}; + +enum dn200_fpe_task_state_t { + __FPE_REMOVING, + __FPE_TASK_SCHED, +}; + +struct dn200_fpe_cfg { + bool enable; /* FPE enable */ + bool hs_enable; /* FPE handshake enable */ + enum dn200_fpe_state lp_fpe_state; /* Link Partner FPE state */ + enum dn200_fpe_state lo_fpe_state; /* Local station FPE state */ +}; + +struct dn200_safety_feature_cfg { + u32 tsoee; + u32 mrxpee; + u32 mestee; + u32 mrxee; + u32 mtxee; + u32 epsi; + u32 edpp; + u32 prtyen; + u32 tmouten; +}; + +struct plat_dn200enet_data { + int bus_id; + int phy_addr; + int interface; + phy_interface_t phy_interface; + struct dn200_mdio_bus_data *mdio_bus_data; + struct device_node *phy_node; + struct device_node *phylink_node; + struct device_node *mdio_node; + struct dn200_dma_cfg *dma_cfg; + struct dn200_est *est; + struct dn200_fpe_cfg *fpe_cfg; + struct dn200_safety_feature_cfg *safety_feat_cfg; + int clk_csr; + int has_gmac; + int enh_desc; + int tx_coe; + int rx_coe; + int bugged_jumbo; + int force_sf_dma_mode; + int riwt_off; + int max_speed; + int maxmtu; + int multicast_filter_bins; + int unicast_filter_entries; + int tx_fifo_size; + int rx_fifo_size; + u32 addr64; + u32 rx_queues_to_use; + u32 tx_queues_to_use; + u8 rx_sched_algorithm; + u8 tx_sched_algorithm; + struct dn200_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; + struct dn200_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + int (*init)(struct platform_device *pdev, void *priv); + void (*exit)(struct platform_device *pdev, void *priv); + struct mac_device_info *(*setup)(void *priv); + void (*dump_debug_regs)(void *priv); + void *bsp_priv; + struct clk *dn200_clk; + struct clk *pclk; + struct 
clk *clk_ptp_ref; + unsigned int clk_ptp_rate; + unsigned int clk_ref_rate; + unsigned int mult_fact_100ns; + s32 ptp_max_adj; + struct dn200_axi *axi; + int has_gmac4; + bool tso_en; + int rss_en; + int mac_port_sel_speed; + bool en_tx_lpi_clockgating; + bool rx_clk_runs_in_lpi; + int has_xgmac; + bool vlan_fail_q_en; + u8 vlan_fail_q; + unsigned int eee_usecs_rate; + struct pci_dev *pdev; + bool has_crossts; + int int_snapshot_num; + int ext_snapshot_num; + bool ext_snapshot_en; + bool multi_msi_en; + int msi_mac_vec; + int msi_lpi_vec; + int msi_sfty_ce_vec; + int msi_sfty_ue_vec; + int msi_rx_base_vec; + int msi_tx_base_vec; + bool sph_disable; +}; + +enum rxp_async_type { + DN200_VF_CLEAR_RXP = BIT(0), + DN200_VF_SET_UMAC = BIT(1), + DN200_VF_SET_FLT = BIT(2), + DN200_VF_APP_BC = BIT(3), +}; + +#define DN200_MAX_MC_ADDR_NUM 33 +struct dn200_vf_rxp_async_info { + u32 crc32; + u32 seq; + u16 uc_cnt; + u16 mc_cnt; + u8 rxq_start; + u8 vf_offset; + bool is_vf; + u8 type; + u64 flags; + u8 uc_mac_addr[ETH_ALEN]; + u8 mc_mac_addr[DN200_MAX_MC_ADDR_NUM][ETH_ALEN]; +} __aligned(8); + +struct dn200_vf_rxp_async_wb { + u32 crc32; + u32 seq; + bool is_promisc; + bool is_allmuslt; + u8 uc_num; + u8 mc_num; +} __aligned(8); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.c b/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.c new file mode 100644 index 000000000000..39b6a76e2ac3 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.c @@ -0,0 +1,2170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ * + * Author: Chen Jisi + * + * Interactive configuration with the controller + */ + +#include +#include "dn200_ctrl.h" +#include "dn200.h" + +static int dn200_nvme_fw_cmd_exec(struct dn200_ctrl_resource *ctrl, + void *cmd, void *info, u32 len, u32 *ret_len, u32 time_out); + +static inline void dn200_lo_hi_writeq(__u64 val, void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr + 4); +} + +static inline __u64 dn200_lo_hi_readq(void __iomem *addr) +{ + const u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void ctrl_update_cq_head(struct dn200_ctrl_resource *ctrl) +{ + u32 next_idx = 0; + + next_idx = ctrl->cq_head + 1; + if (next_idx == ctrl->q_depth) { + ctrl->cq_head = 0; + ctrl->cq_phase ^= 1; + } else { + ctrl->cq_head = next_idx; + } +} + +static void ctrl_submit_cmd(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *cmd) +{ + memcpy(ctrl->sq_cmds + (ctrl->sq_tail << ctrl->sqes), + cmd, sizeof(*cmd)); + ctrl->sq_tail = ctrl->sq_tail + 1; + if (ctrl->sq_tail == ctrl->q_depth) + ctrl->sq_tail = 0; + /*wmb to ensure sq_tail's value avail*/ + wmb(); + writel(ctrl->sq_tail, ctrl->dbs); +} + +static inline void ctrl_process_cq(struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_completion *hcqe = &ctrl->cqes[ctrl->cq_head]; + + if ((le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == ctrl->cq_phase) { + ctrl->rdata[ctrl->cq_head] = le32_to_cpu(hcqe->result.u32); + ctrl_update_cq_head(ctrl); + writel(ctrl->cq_head, ctrl->bar + 0x1000 + 0x4); + } +} + +static irqreturn_t ctrl_irq(int irq, void *data) +{ + struct dn200_ctrl_resource *ctrl = (struct dn200_ctrl_resource *)data; + + dev_dbg(ctrl->dev, + "%s %d: irq=%d q_depth=%#x sq_tail=%#x cq_head=%#x\n", + __func__, __LINE__, irq, ctrl->q_depth, ctrl->sq_tail, + ctrl->cq_head); + return IRQ_HANDLED; +} + +static irqreturn_t dn200_itr_upgrade(int irq, void *data) +{ + struct plat_dn200_data *plat_ex = (struct 
plat_dn200_data *)data; + struct dn200_priv *priv = plat_ex->priv_back; + u32 flags = 0; + u32 magic_num = 0; + u32 status = 0; + + if (priv->flag_upgrade) + return IRQ_HANDLED; + if (!priv->plat_ex->upgrade_with_flowing) + return IRQ_HANDLED; + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, nic_st, &flags); + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, magic_num, &magic_num); + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, rsv, &status); + dev_dbg(priv->device, + "%s %d flags=0x%x, magic_num = 0x%x\n", + __func__, __LINE__, flags, magic_num); + + if (flags == DN200_UNFINISH_FLAG) { + dev_dbg(priv->device, "[loading fw]dn200 load fw fail, for img copy to flash happened err\n"); + } else if (flags == DN200_STOP_FLAG) { + /*here is necessary, for we need prevent other's action*/ + /*need guarantee it only exec once*/ + set_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + mod_timer(&priv->upgrade_timer, jiffies + msecs_to_jiffies(200)); + } else if (flags == DN200_JMP_FAIL_FLAG) { + return IRQ_HANDLED; /*it not necessary,but give it*/ + } else if (flags == DN200_START_FLAG) { + return IRQ_HANDLED; /*it not necessary,but give it*/ + } + return IRQ_HANDLED; +} +static irqreturn_t dn200_itr_peer_noitfy(int irq, void *data) +{ + struct plat_dn200_data *plat_ex = (struct plat_dn200_data *)data; + struct dn200_priv *priv = plat_ex->priv_back; + u8 hw_reset = 0; + u8 flow_state = 0; + u8 mac_reset = 0; + u8 link_reset = 0; + u8 carrier_reset = 0; + u8 rxp_task; + + DN200_ITR_SYNC_GET(priv->hw, itr_sync_app, HW_RESET_ID, &hw_reset); + if (PRIV_IS_VF(priv)) + dev_dbg(priv->device, "%s, %d, peer to pf hw reset notify:%#x\n", + __func__, __LINE__, hw_reset); + if (hw_reset && !PRIV_IS_VF(priv)) { + /* pf process err reset notification from vf, run global err reset flow + */ + dev_dbg(priv->device, "%s, %d, pf process rst from vf, hw_reset state:%d\n", + __func__, __LINE__, hw_reset); + dn200_pf_glb_err_rst_process(priv); + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, HW_RESET_ID, 0); 
+ } + + DN200_ITR_SYNC_GET(priv->hw, itr_sync_app, FLOW_STATE_ID, &flow_state); + if (flow_state && PRIV_IS_VF(priv)) { + DN200_ITR_SYNC_GET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), &flow_state); + if (flow_state == FLOW_CLOSE_START) { + DN200_ITR_SYNC_SET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), + FLOW_CLOSE_DONE); + netif_carrier_off(priv->dev); + set_bit(DN200_VF_FLOW_CLOSE, &priv->state); + } + if (flow_state == FLOW_OPEN_START) { + DN200_ITR_SYNC_SET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), + FLOW_OPEN_DONE); + set_bit(DN200_VF_FLOW_OPEN_SET, &priv->state); + } + if (!test_and_set_bit(DN200_VF_FLOW_STATE_SET, &priv->state)) + dn200_vf_work(priv); + } + + if (PRIV_IS_VF(priv)) { + DN200_ITR_SYNC_GET(priv->hw, vf_reset_mac_list, + priv->plat_ex->vf_offset, &mac_reset); + if (mac_reset) { + dn200_vf_mac_change(priv); + DN200_ITR_SYNC_SET(priv->hw, vf_reset_mac_list, + priv->plat_ex->vf_offset, 0); + } + DN200_ITR_SYNC_GET(priv->hw, vf_link_list, + priv->plat_ex->vf_offset, &link_reset); + if (link_reset) { + dn200_vf_link_set(priv, link_reset); + DN200_ITR_SYNC_SET(priv->hw, vf_reset_mac_list, + priv->plat_ex->vf_offset, 0); + } + + DN200_ITR_SYNC_GET(priv->hw, pf_carrier, 0, &carrier_reset); + netdev_dbg(priv->dev, "%s %d notify vf link off, get %d\n", __func__, __LINE__, carrier_reset); + if (carrier_reset) { + if (netif_running(priv->dev)) { + netif_carrier_off(priv->dev); + netif_tx_stop_all_queues(priv->dev); + } + DN200_ITR_SYNC_SET(priv->hw, vf_carrier, priv->plat_ex->vf_offset, 1); + netdev_dbg(priv->dev, "%s %d notify pf wb vf %d\n", __func__, __LINE__, priv->plat_ex->vf_offset); + } + } + if (PRIV_SRIOV_SUPPORT(priv)) { + DN200_ITR_SYNC_GET(priv->hw, itr_sync_app, RXP_TASK, &rxp_task); + dn200_async_rxp_work(priv); + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, RXP_TASK, 0); + } + return IRQ_HANDLED; +} + +static int ctrl_alloc_queue(struct dn200_ctrl_resource *ctrl) +{ + int ret = 0; + 
+ if (ctrl->addr64 > 32) + ret = dma_set_mask_and_coherent(ctrl->dev, DMA_BIT_MASK(32)); + ctrl->cqes = + dma_alloc_coherent(ctrl->dev, + ctrl->q_depth * sizeof(struct ctrl_completion), + &ctrl->cq_dma_addr, GFP_KERNEL); + if (!ctrl->cqes) + goto free_ctrlq; + + ctrl->sq_cmds = dma_alloc_coherent(ctrl->dev, ctrl->q_depth * 64, + &ctrl->sq_dma_addr, GFP_KERNEL); + if (!ctrl->sq_cmds) + goto free_cqdma; + if (ctrl->addr64 > 32) + ret = dma_set_mask_and_coherent(ctrl->dev, DMA_BIT_MASK(64)); + + return ret; + +free_cqdma: + dma_free_coherent(ctrl->dev, + ctrl->q_depth * sizeof(struct ctrl_completion), + (void *)ctrl->cqes, ctrl->cq_dma_addr); +free_ctrlq: + return -ENOMEM; +} + +static void ctrl_init_queue(struct dn200_ctrl_resource *ctrl) +{ + ctrl->sq_tail = 0; + ctrl->last_sq_tail = 0; + ctrl->cq_head = 0; + ctrl->cq_phase = 1; + memset((void *)ctrl->cqes, 0, + ctrl->q_depth * sizeof(struct ctrl_completion)); + wmb(); /* ensure the first interrupt sees the initialization */ +} + +static void ctrl_init(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl) +{ + memset(ctrl, 0, sizeof(*ctrl)); + ctrl->pcie_ava = true; + ctrl->bar = pcim_iomap_table(pdev)[0]; + ctrl->dev = &pdev->dev; + ctrl->q_depth = SQ_DEPTH; + ctrl->sqes = 6; + ctrl->cap = dn200_lo_hi_readq(ctrl->bar + REG_CAP); + ctrl->dbs = ctrl->bar + 4096; + ctrl->qid = 0; /*only admin q */ +} + +static void shutdown_nvme_ctrl(struct dn200_ctrl_resource *ctrl) +{ + u32 config = 0; + + config = readl(ctrl->bar + REG_CC); + config &= ~CTRL_CC_SHN_MASK; + config |= CTRL_CC_SHN_NORMAL; + writel(config, ctrl->bar + REG_CC); + mdelay(100); +} + +void shutdown_ctrl(struct dn200_ctrl_resource *ctrl) +{ + struct msix_entry *msix_entries = ctrl->msix_entries; + + if (ctrl->pcie_ava) + shutdown_nvme_ctrl(ctrl); + dma_free_coherent(ctrl->dev, + ctrl->q_depth * sizeof(struct ctrl_completion), + (void *)ctrl->cqes, ctrl->cq_dma_addr); + dma_free_coherent(ctrl->dev, ctrl->q_depth * 64, ctrl->sq_cmds, + ctrl->sq_dma_addr); + 
synchronize_irq(msix_entries[0].vector); + devm_free_irq(ctrl->dev, msix_entries[0].vector, ctrl); +} + +static void disable_ctrl(struct dn200_ctrl_resource *ctrl) +{ + u32 data; + int ret; + + clear_bit(ADMIN_QUEUE_INITED, &ctrl->admin_state); + + writel(0, ctrl->bar + REG_CC); /*disable_ctrl */ + if (ctrl->is_extern_phy) + ret = readl_poll_timeout(ctrl->bar + + REG_CSTS, data, !(data & BIT(0)), 100, 30000000); + else + ret = readl_poll_timeout_atomic(ctrl->bar + + REG_CSTS, data, !(data & BIT(0)), 100, 30000000); + if (ret) + dev_err(ctrl->dev, "func %s, line %d: ctrl cc disable timeout\n", + __func__, __LINE__); +} + +static void enable_ctrl(struct dn200_ctrl_resource *ctrl) +{ + u32 config; + + ctrl->cap = dn200_lo_hi_readq(ctrl->bar + REG_CAP); /*enable ctrl */ + if (((ctrl->cap >> 37) & 0xff) & CTRL_CAP_CSS_CSI) + config = CTRL_CC_CSS_CSI; + else + config = 0; + config |= CTRL_CC_IOSQES | CTRL_CC_IOCQES; + + writel(config, ctrl->bar + REG_CC); +} + +int dn200_ctrl_ccena(struct pci_dev *pdev, bool off, bool on, bool is_atomic) +{ + u32 data; + int ret0 = 0, ret1 = 0; + + if (off) { + writel(readl(pcim_iomap_table(pdev)[0] + REG_CC) & 0xfffe, + pcim_iomap_table(pdev)[0] + REG_CC); + if (is_atomic) + ret0 = + readl_poll_timeout_atomic(pcim_iomap_table(pdev)[0] + + REG_CSTS, data, !(data & BIT(0)), + 100, 10000000); + else + ret0 = + readl_poll_timeout(pcim_iomap_table(pdev)[0] + + REG_CSTS, data, !(data & BIT(0)), + 100, 30000000); + } + if (on) { + writel(readl(pcim_iomap_table(pdev)[0] + REG_CC) | 1, + pcim_iomap_table(pdev)[0] + REG_CC); + if (is_atomic) + ret1 = + readl_poll_timeout_atomic(pcim_iomap_table(pdev)[0] + + REG_CSTS, data, (data & BIT(0)), + 100, 30000000); + else + ret1 = + readl_poll_timeout(pcim_iomap_table(pdev)[0] + + REG_CSTS, data, (data & BIT(0)), + 100, 30000000); + } + /* mask nvme intr pin */ + writel(0xffffffff, pcim_iomap_table(pdev)[0] + REG_INTMS); + return ret0 || ret1; +} + +int dn200_ena_msix_range(struct pci_dev *pdev, struct 
dn200_ctrl_resource *ctrl, + int tx_queues_to_use, int rx_queues_to_use, + bool is_purepf) +{ + int i, ret; + int num_vectors, total_vecs, irq_num; + struct msix_entry *msix_entries; + struct plat_dn200_data *plat_ex; + + /* clear nvme intr mask */ + writel(0xffffffff, ctrl->bar + REG_INTMC); + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + /*tx, rx, admin, others, notify, upgrade*/ + num_vectors = + tx_queues_to_use + rx_queues_to_use + 1 + (plat_ex->is_vf ? 0 : 4) + + 1 + 1; + if (num_vectors > pci_msix_vec_count(pdev)) { + dev_err(ctrl->dev, + "func %s, line %d: num_vectors = %d, tx_queues_to_use = %d, rx_queues_to_use = %d.\n", + __func__, __LINE__, num_vectors, tx_queues_to_use, + rx_queues_to_use); + return DN200_FAILURE; + } + + msix_entries = + devm_kzalloc(ctrl->dev, (sizeof(struct msix_entry) * num_vectors), + GFP_KERNEL); + if (!msix_entries) + return -ENOMEM; + + for (i = 0; i < num_vectors; i++) + msix_entries[i].entry = i; + + total_vecs = + pci_enable_msix_range(pdev, msix_entries, num_vectors, num_vectors); + if (total_vecs < num_vectors) { + dev_err(ctrl->dev, + "func %s, line %d: alloc vec failed! 
total_vecs = %d, need %d\n", + __func__, __LINE__, total_vecs, num_vectors); + ret = total_vecs; + goto no_msix; + } + ctrl->msix_entries = msix_entries; + + irq_num = msix_entries[0].vector; + memset(ctrl->itr_name_ctrl, 0, sizeof(ctrl->itr_name_ctrl)); + sprintf(ctrl->itr_name_ctrl, "%s(%d:%d):%s-%d", DN200_RESOURCE_NAME, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), + "ctrl", plat_ex->funcid); + ret = + devm_request_irq(ctrl->dev, irq_num, ctrl_irq, 0, + ctrl->itr_name_ctrl, ctrl); + if (ret) { + dev_err(ctrl->dev, + "func %s, line %d: ctrl irq_num = %d register fail.\n", + __func__, __LINE__, irq_num); + goto err_ctrl_irq; + } + + irq_num = msix_entries[num_vectors - 2].vector; + memset(ctrl->itr_name_peer_notify, 0, + sizeof(ctrl->itr_name_peer_notify)); + sprintf(ctrl->itr_name_peer_notify, "%s(%d:%d):%s-%d", + DN200_RESOURCE_NAME, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), "notify", plat_ex->funcid); + ret = + devm_request_irq(ctrl->dev, irq_num, dn200_itr_peer_noitfy, 0, + ctrl->itr_name_peer_notify, plat_ex); + if (ret) { + dev_err(ctrl->dev, + "func %s, line %d: notice irq_num = %d register fail.\n", + __func__, __LINE__, irq_num); + goto err_notice_irq; + } + + irq_num = msix_entries[num_vectors - 1].vector; + memset(ctrl->itr_name_upgrade, 0, + sizeof(ctrl->itr_name_upgrade)); + sprintf(ctrl->itr_name_upgrade, "%s(%d:%d):%s-%d", + DN200_RESOURCE_NAME, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), "upgrade", plat_ex->funcid); + ret = + devm_request_irq(ctrl->dev, irq_num, dn200_itr_upgrade, 0, + ctrl->itr_name_upgrade, plat_ex); + if (ret) { + dev_err(ctrl->dev, + "func %s, line %d: notice irq_num = %d register fail.\n", + __func__, __LINE__, irq_num); + goto err_upgrade_irq; + } + plat_ex->total_irq = num_vectors; + return 0; +err_upgrade_irq: + devm_free_irq(ctrl->dev, msix_entries[num_vectors - 1].vector, plat_ex); +err_notice_irq: + devm_free_irq(ctrl->dev, msix_entries[num_vectors - 2].vector, plat_ex); +err_ctrl_irq: + 
devm_free_irq(ctrl->dev, msix_entries[0].vector, ctrl); + + pci_disable_msix(pdev); +no_msix: + kfree(msix_entries); + return ret; +} + +static inline bool nvme_cqe_pending(struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_completion *hcqe = &ctrl->cqes[ctrl->cq_head]; + + return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == ctrl->cq_phase; +} + +static inline int ctrl_poll_process_cq(struct dn200_ctrl_resource *ctrl, bool is_interupt, u32 delay_times) +{ + struct ctrl_completion *hcqe; + int found = 0; + int try = 0; + int ret = 0; + + do { + if (is_interupt) + usleep_range(5, 10); + else + udelay(5); + try++; + while (nvme_cqe_pending(ctrl)) { + found++; + /* load-load control dependency between phase and the rest of + * the cqe requires a full read memory barrier + */ + dma_rmb(); + hcqe = &ctrl->cqes[ctrl->cq_head]; + ctrl->rdata[ctrl->cq_head] = + le64_to_cpu(hcqe->result.u64); + ctrl_update_cq_head(ctrl); + } + if (found && ctrl->cq_head == ctrl->sq_tail) + break; + } while (try < delay_times); + dev_dbg(ctrl->dev, "found=%d, try = %d\n", found, try); + if (found) { + writel(ctrl->cq_head, ctrl->dbs + 0x4); + return ret; + } + + dev_dbg(ctrl->dev, "driver not found cq! 
func=%s, line=%d\n", + __func__, __LINE__); + return -EIO; +} + +static void dn200_admin_process_atomic(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *c, int *ret_val, u32 *cq_val); +static void dn200_admin_process(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *c, int *ret_val, u32 *cq_val); +static const struct admin_process_ops dn200_admin_process_ops_atomic = { + .dn200_admin_process = dn200_admin_process_atomic, +}; + +static const struct admin_process_ops dn200_admin_process_ops = { + .dn200_admin_process = dn200_admin_process, +}; + +static int get_funcid(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl); +int admin_queue_configure(struct pci_dev *pdev, + struct dn200_ctrl_resource *ctrl, bool is_purepf, bool is_extern_phy) +{ + u32 aqa; + int ret; + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + ctrl_init(pdev, ctrl); + ctrl->is_extern_phy = is_extern_phy; + if ((readl(ctrl->bar + REG_CSTS) & CSTS_NSSRO)) + writel(CSTS_NSSRO, ctrl->bar + REG_CSTS); + + disable_ctrl(ctrl); + + if (ctrl_alloc_queue(ctrl)) + return -ENOMEM; + + aqa = ctrl->q_depth - 1; + aqa |= aqa << 16; + writel(aqa, ctrl->bar + REG_AQA); + dn200_lo_hi_writeq(ctrl->sq_dma_addr, ctrl->bar + REG_ASQ); + dn200_lo_hi_writeq(ctrl->cq_dma_addr, ctrl->bar + REG_ACQ); + + enable_ctrl(ctrl); + + ctrl_init_queue(ctrl); + + ret = dn200_ctrl_ccena(pdev, 1, 1, false); + if (ret) { + dev_err(ctrl->dev, "func %s, line %d: ctrl cc enable timeout\n", + __func__, __LINE__); + return ret; + } + set_bit(ADMIN_QUEUE_INITED, &ctrl->admin_state); + + if (ctrl->is_extern_phy) { + mutex_init(&ctrl->mlock); + ctrl->dn200_admin_process_ops = &dn200_admin_process_ops; + } else { + spin_lock_init(&ctrl->lock); + ctrl->dn200_admin_process_ops = &dn200_admin_process_ops_atomic; + } + if (is_purepf) { + plat_ex->funcid = pdev->devfn; + return 0; + } + ret = get_funcid(pdev, ctrl); + if (ret < 0) { + dma_free_coherent(ctrl->dev, + ctrl->q_depth * 
sizeof(struct ctrl_completion), + (void *)ctrl->sq_cmds, ctrl->sq_dma_addr); + dma_free_coherent(ctrl->dev, + ctrl->q_depth * sizeof(struct ctrl_completion), + (void *)ctrl->cqes, ctrl->cq_dma_addr); + + shutdown_nvme_ctrl(ctrl); + } + return ret; +} + +int ctrl_reinitial(struct dn200_ctrl_resource *ctrl) +{ + int ret = 0; + u32 aqa; + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (!dn200_hwif_id_check(plat_ex->io_addr)) + return -EIO; + + aqa = ctrl->q_depth - 1; + aqa |= aqa << 16; + writel(aqa, ctrl->bar + REG_AQA); + dn200_lo_hi_writeq(ctrl->sq_dma_addr, ctrl->bar + REG_ASQ); + dn200_lo_hi_writeq(ctrl->cq_dma_addr, ctrl->bar + REG_ACQ); + enable_ctrl(ctrl); + ctrl_init_queue(ctrl); + if (ctrl->is_extern_phy) + ret = dn200_ctrl_ccena(plat_ex->pdev, 1, 1, false); + else + ret = dn200_ctrl_ccena(plat_ex->pdev, 1, 1, true); + if (ret) { + dev_err(ctrl->dev, "func %s, line %d:cc enable timeout\n", + __func__, __LINE__); + return ret; + } + udelay(10); + set_bit(ADMIN_QUEUE_INITED, &ctrl->admin_state); + return ret; +} + +int ctrl_reset(struct dn200_ctrl_resource *ctrl, bool reset_irq) +{ + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (!dn200_hwif_id_check(plat_ex->io_addr)) + return -EIO; + if (test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + shutdown_nvme_ctrl(ctrl); + disable_ctrl(ctrl); + return ctrl_reinitial(ctrl); +} + +static void dn200_admin_process_atomic(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *c, int *ret_val, u32 *cq_val) +{ + unsigned long flags; + int idx = 0; + + if (!test_bit(ADMIN_QUEUE_INITED, &ctrl->admin_state)) + return; + + spin_lock_irqsave(&ctrl->lock, flags); + idx = ctrl->cq_head; + ctrl_submit_cmd(ctrl, c); + *ret_val = ctrl_poll_process_cq(ctrl, false, DN200_POLL_CQ_MAX_TIMES_ATOMIC); + *cq_val = ctrl->rdata[idx]; + spin_unlock_irqrestore(&ctrl->lock, flags); +} + +static void 
dn200_admin_process(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *c, int *ret_val, u32 *cq_val) +{ + int idx = 0; + + if (!test_bit(ADMIN_QUEUE_INITED, &ctrl->admin_state)) + return; + + mutex_lock(&ctrl->mlock); + idx = ctrl->cq_head; + ctrl_submit_cmd(ctrl, c); + *ret_val = ctrl_poll_process_cq(ctrl, true, DN200_POLL_CQ_MAX_TIMES); + *cq_val = ctrl->rdata[idx]; + mutex_unlock(&ctrl->mlock); +} + +int irq_queue_map(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl, + int tx_queues_to_use, int rx_queues_to_use, bool need_retry) +{ + int ret = 0; + struct plat_dn200_data *plat_ex; + struct ctrl_command c = { }; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + + if (tx_queues_to_use == 0 || rx_queues_to_use == 0) { + dev_err(ctrl->dev, + "func %s, line %d: tx_queues_to_use =%d, rx_queues_to_use =%d\n", + __func__, __LINE__, tx_queues_to_use, rx_queues_to_use); + return DN200_FAILURE; + } + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; +again: + memset(&c, 0, sizeof(c)); + c.irq_share_command.opcode = ctrl_admin_vendor_start; + c.irq_share_command.set = IRQ_INFO_CONFIG; + c.irq_share_command.tx_q = + tx_queues_to_use << 16 | plat_ex->tx_queue_start; + c.irq_share_command.rx_q = + rx_queues_to_use << 16 | plat_ex->rx_queue_start; + c.irq_share_command.dword15 = 0; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret && need_retry) { + dev_err(&pdev->dev, "%s, %d, poll cq fail\n", __func__, + __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, false); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + } + return ret; +} + +int irq_info_pfvf_release(struct pci_dev *pdev, + struct dn200_ctrl_resource *ctrl, bool need_retry) +{ + int ret = 0; + struct 
plat_dn200_data *plat_ex; + struct ctrl_command c = { }; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.irq_release_command.opcode = ctrl_admin_vendor_start; + c.irq_release_command.set = IRQ_INFO_PFVF_RELEASE; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret && need_retry) { + dev_err(&pdev->dev, "%s, %d, poll cq fail\n", __func__, + __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, false); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + } + return ret; +} + +static int get_funcid(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl) +{ + struct plat_dn200_data *plat_ex; + struct ctrl_command c = { }; + int ret = 0; + int idx = 0; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + idx = ctrl->cq_head; + c.irq_release_command.opcode = ctrl_admin_vendor_start; + c.irq_release_command.set = GET_FUNCID; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + plat_ex->funcid = cq_val; + if (ret) { + dev_err(&pdev->dev, "%s, %d, poll cq fail\n", __func__, + __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, false); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + } + return ret; +} + +int fw_reg_write(struct dn200_ctrl_resource *ctrl, u32 reg, u32 value) +{ + struct ctrl_command c = { }; + int ret = 0; 
+ struct plat_dn200_data *plat_ex = NULL; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = REG_WRITE; + c.rw_command.addr = reg; + c.rw_command.value = value; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + dev_err(&plat_ex->pdev->dev, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +int fw_reg_read(struct dn200_ctrl_resource *ctrl, u32 reg, u32 *value) +{ + struct ctrl_command c = { }; + int ret = 0; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (plat_ex->priv_back && test_bit(DN200_PCIE_UNAVAILD, &plat_ex->priv_back->state)) + return -EIO; +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = REG_READ; + c.rw_command.addr = reg; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, value); + if (ret) { + dev_err(&plat_ex->pdev->dev, "%s, %d, poll cq fail\n", + __func__, __LINE__); + + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + 
dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +int fw_link_state_set(struct dn200_ctrl_resource *ctrl, u8 link_state, u8 duplex, u32 speed) +{ + struct ctrl_command c = { }; + int ret = 0; + struct plat_dn200_data *plat_ex; + u32 cq_val; + u8 speed_sel = 0; + bool have_reset = false; + + if (ctrl->is_extern_phy) + return 0; + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + while (speed) { + speed = speed / 10; + if (speed == 1) + break; + speed_sel++; + } +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.irq_release_command.opcode = ctrl_admin_vendor_start; + c.irq_release_command.set = FW_STATE_GET; + c.irq_release_command.dword13 |= (link_state & 0x1); + c.irq_release_command.dword13 |= ((duplex & 0x1) << 1); + c.irq_release_command.dword13 |= ((speed_sel & 0x3) << 2); + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +int lram_and_rxp_lock_and_unlock(struct dn200_ctrl_resource *ctrl, u32 value, + u32 *ret_val) +{ + struct ctrl_command c = { }; + int ret = 0; + struct plat_dn200_data *plat_ex; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = LRAM_RXP_LOCK; + c.rw_command.value = 
value; + c.rw_command.addr = plat_ex->pf_id; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + *ret_val = cq_val & 0xff; + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + if (*ret_val == 0xff) + dev_err(plat_ex->priv_back->device, "%s %d lock failed ! invalid cmd.\n", + __func__, __LINE__); + else + *ret_val &= 0x1; + + return ret; +} + +static int get_hw_type(struct dn200_ctrl_resource *ctrl, u32 *value, u32 type) +{ + struct ctrl_command c = { }; + int ret = 0; + int idx; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + idx = ctrl->cq_head; + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = type; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, value); + if (ret) { + dev_err(ctrl->dev, "%s, %d, poll cq fail\n", __func__, + __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, false); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + } + return ret; +} + +int get_pcb_type(struct dn200_ctrl_resource *ctrl, u32 *value) +{ + return get_hw_type(ctrl, value, PCB_TYPE); +} + +int get_rj45_type(struct dn200_ctrl_resource *ctrl, u32 *value) +{ + return get_hw_type(ctrl, value, RJ45_TYPE); +} + +int dn200_led_blink_ctrl(struct dn200_ctrl_resource *ctrl, bool is_enable) +{ + struct ctrl_command c = { 
}; + int ret = 0; + struct plat_dn200_data *plat_ex; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_vendor_start; + c.common_command.cdw12 = FW_PWM_CTRL; + c.common_command.cdw13 = is_enable; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + dev_err(ctrl->dev, "%s, %d, poll cq fail\n", __func__, + __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +/** + * irq_peer_notify - notify peer(pf or vf) through hw interrupt. + * @pdev : pcie device + * @ctrl: ctrl structure + * Description: + * this is a fast notification function called by vf or pf to send msg to peer. + * for pf, will send the hw interrupt to all vfs + * for vf, just send the hw interrupt to pf + * Return value: + * 0 on success and an appropriate (-)ve integer as defined in errno.h + * file on failure. 
+ */ +int irq_peer_notify(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_command c = { }; + struct plat_dn200_data *plat_ex; + int ret = 0; + u32 cq_val = 0; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + c.irq_release_command.opcode = ctrl_admin_vendor_start; + c.irq_release_command.set = IRQ_INFO_NOTICE; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + if (plat_ex->priv_back) + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +/** + * dn200_ctrl_res_free - free ctrl all resources. + * @pdev : pcie device + * @ctrl: ctrl structure + * Description: + * this is the free function called by pcie remove, suppend, etc. + * Return value: + * 0 on success and an appropriate (-)ve integer as defined in errno.h + * file on failure. 
+ */ +int dn200_ctrl_res_free(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl) +{ + int ret; + int peer_notify_vect; + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + peer_notify_vect = plat_ex->total_irq - 2; + if (ctrl->pcie_ava && dn200_hwif_id_check(plat_ex->io_addr)) { + ret = irq_info_pfvf_release(pdev, ctrl, true); + if (ret) + dev_err(&pdev->dev, + "func %s, line %d: pfvf_release fail\n", + __func__, __LINE__); + } + shutdown_ctrl(ctrl); + synchronize_irq(ctrl->msix_entries[peer_notify_vect].vector); + devm_free_irq(ctrl->dev, ctrl->msix_entries[peer_notify_vect].vector, + plat_ex); + + synchronize_irq(ctrl->msix_entries[peer_notify_vect + 1].vector); + devm_free_irq(ctrl->dev, ctrl->msix_entries[peer_notify_vect + 1].vector, + plat_ex); + if (ctrl->is_extern_phy) + mutex_destroy(&ctrl->mlock); + return 0; +} + +static void dn200_parse_fw_load_err(struct device *dev, int err_code) +{ + int i; + struct fw_load_err { + int err; + char *info; + } err_info[] = { + {0x4006, "the NIC system is busy"}, + {0x400c, "the NIC upgrade in progress"}, + {0x4002, "the NIC nvme parameter err"}, + {0x4013, "the NIC nvme prp para err"}, + {0x4007, "the NIC upgrade req abort/fw is invalid"}, + }; + + for (i = 0; i < ARRAY_SIZE(err_info); i++) { + if (err_info[i].err == err_code) { + dev_err(dev, "[loading fw]%s, sf %#x.\n", err_info[i].info, err_code); + return; + } + } + + dev_err(dev, "[loading fw]fw download cmd, sf %#x.\n", err_code); +} + +int dn200_nvme_fw_load(struct dn200_ctrl_resource *ctrl, const char *fw, size_t fw_size) +{ + struct ctrl_command c = {}; + int ret = 0; + int idx; + u32 pos, len, seg_len; + u32 cq_val = 0; + void *buf; + dma_addr_t addr; + struct page *page; + struct dn200_priv *priv; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + priv = plat_ex->priv_back; + page = dn200_alloc_dma_page_dir(priv, &addr, 
DMA_TO_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + + len = fw_size; + seg_len = CTRL_MAX_SEG_LEN; + + for (pos = 0; pos < len; pos += seg_len) { + seg_len = min(seg_len, len - pos); + memcpy(buf, fw + pos, seg_len); + + /*wmb to protect buf avail*/ + wmb(); + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state)) { + dn200_free_dma_page_dir(priv, page, addr, DMA_TO_DEVICE); + return -EIO; + } +again: + memset(&c, 0, sizeof(c)); + c.fw_download_command.opcode = ctrl_admin_download_fw; + c.fw_download_command.dptr.prp1 = cpu_to_le64(addr); + c.fw_download_command.numd = + (cpu_to_le32(seg_len) >> 2) > 0 ? + ((cpu_to_le32(seg_len) >> 2) - 1) : + 0; + c.fw_download_command.ofst = cpu_to_le32(pos) >> 2; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + dev_err(ctrl->dev, + "[loading fw]downloading, get cq fail.\n"); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && + !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + break; + } + + idx = (ctrl->cq_head + ctrl->q_depth - 1) % ctrl->q_depth; + /* check status field */ + ret = ctrl->cqes[idx].status >> 1; + if (ret) { + dn200_parse_fw_load_err(ctrl->dev, ret); + break; + } + } + + dn200_free_dma_page_dir(priv, page, addr, DMA_TO_DEVICE); + return ret; +} + +static int dn200_ctrl_check_cq(struct dn200_ctrl_resource *ctrl, + u32 interval, u32 times, struct ctrl_completion **cqe) +{ + struct ctrl_completion *hcqe; + ktime_t timeout; + int try = 0; + int ret = -EBUSY; + + while (try++ < times) { + timeout = ktime_add_us(ktime_get(), interval); + while (ktime_compare(ktime_get(), timeout) < 0) { + if (ctrl->is_extern_phy) + usleep_range(1, 5); + else + udelay(1); + } + + while (nvme_cqe_pending(ctrl)) { + dma_rmb(); + hcqe = &ctrl->cqes[ctrl->cq_head]; + 
ctrl->rdata[ctrl->cq_head] = le64_to_cpu(hcqe->result.u64); + if (cqe) + *cqe = hcqe; + ctrl_update_cq_head(ctrl); + } + if (ctrl->cq_head == ctrl->sq_tail) { + writel(ctrl->cq_head, ctrl->dbs + 0x4); + ret = 0; + break; + } + } + + return ret; +} + +static int dn200_nvme_fw_slot_get(struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_command c = {}; + int ret = -EBUSY; + u32 cq_val = 0; + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state)) + return -EIO; + + memset(&c, 0, sizeof(c)); + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = FW_SLOT_GET; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + return ret; + } + + return cq_val; +} + +static int dn200_nvme_fw_get_state(struct dn200_ctrl_resource *ctrl, u32 *cq_val) +{ + struct ctrl_command c = {}; + int ret = -EBUSY; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state)) + return -EIO; +again: + memset(&c, 0, sizeof(c)); + c.rw_command.opcode = ctrl_admin_vendor_start; + c.rw_command.set = FW_UP_STATUS; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, cq_val); + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + return ret; +} + +static int dn200_nvme_fw_commit_polling(struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_command c = {}; + int ret = -EBUSY; + u32 cq_val = 0; + 
struct plat_dn200_data *plat_ex; + int slot = 2; + bool have_reset = false; + unsigned long in_time; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + dn200_nvme_fw_get_state(ctrl, &cq_val); + if ((cq_val >> 16) == DN200_FW_UPGRADE_COMIT_DOING) { + dev_err(plat_ex->priv_back->device, "%s, %d,cq_val %d fw commit is not idle\n", + __func__, __LINE__, cq_val); + return -EIO; + } + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state)) + return -EIO; + /* update the backup slot */ + slot = dn200_nvme_fw_slot_get(ctrl); + if (slot == 1 || slot == 2) + slot = 3 - slot; + else + slot = 2; + + dev_info(plat_ex->priv_back->device, "commit fw to slot %d.\n", slot); +again: + memset(&c, 0, sizeof(c)); + c.fw_commit_command.opcode = ctrl_admin_vendor_start; + c.fw_commit_command.dword12 = FW_UP_COMMIT; + /* update to slot 2, -s 2 -a 1 */ + c.fw_commit_command.dword13 = slot; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + /* when input invalid slot, output the slot val */ + if (c.fw_commit_command.dword13 == cq_val) { + ret = -EINVAL; + return ret; + } + if (cq_val == 0xff) { + ret = -EOPNOTSUPP; /* fw not support the cmd, use old cmd */ + return ret; + } + if (ret < 0) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + if (ret) + return ret; + + in_time = jiffies; + /* check status of update */ + do { + if (time_after(jiffies, in_time + msecs_to_jiffies(20000))) { + dev_info(plat_ex->priv_back->device, "fw get state exceed time\n"); + break; + } + msleep(1000); + ret = dn200_nvme_fw_get_state(ctrl, &cq_val); + } while (!ret && (cq_val >> 16) == DN200_FW_UPGRADE_COMIT_DOING); + + if (!ret && 
(cq_val >> 16) == DN200_FW_UPGRADE_COMIT_FINISH) + return cq_val & 0xffff; + dev_err(plat_ex->priv_back->device, "%s, %d, dn200 can not get commit finish state\n", + __func__, __LINE__); + return ret; +} + +int dn200_nvme_fw_commit(struct dn200_ctrl_resource *ctrl) +{ + struct ctrl_completion *hcqe; + struct ctrl_command c = {}; + int ret = -EBUSY; + int slot = 2; + unsigned long flags = 0; + char *buf; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state)) + return -EIO; + + if (!ctrl->is_extern_phy) + return dn200_nvme_fw_commit_polling(ctrl); + /* update the backup slot */ + slot = dn200_nvme_fw_slot_get(ctrl); + if (slot == 1 || slot == 2) + slot = 3 - slot; + else + slot = 2; + + if (slot == 1) { + buf = vmalloc(CTRL_MAX_SEG_LEN); + if (test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) { + clear_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state); + (void)dn200_nvme_fw_cmd_exec(ctrl, "rom_protect 0", buf, CTRL_MAX_SEG_LEN, NULL, 0); + set_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state); + } else { + (void)dn200_nvme_fw_cmd_exec(ctrl, "rom_protect 0", buf, CTRL_MAX_SEG_LEN, NULL, 0); + } + vfree(buf); + } + dev_info(plat_ex->priv_back->device, "commit fw to slot %d.\n", slot); +again: + if (ctrl->is_extern_phy) + mutex_lock(&ctrl->mlock); + else + spin_lock_irqsave(&ctrl->lock, flags); + memset(&c, 0, sizeof(c)); + c.fw_commit_command.opcode = ctrl_admin_activate_fw; + /* The newly fw is activated at the next Controller Level Reset, -s 2 -a 1 */ + if (plat_ex->upgrade_with_flowing) { + c.fw_commit_command.dword10 = (0x3 << 3) | slot; + ctrl_submit_cmd(ctrl, &c); + ret = 0; + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + return ret; + } + + c.fw_commit_command.dword10 = (0x1 << 3) | slot; + ctrl_submit_cmd(ctrl, &c); + ret = dn200_ctrl_check_cq(ctrl, 1000, 30000, &hcqe); + if 
(ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + if (ret) { + dev_err(ctrl->dev, "[loading fw]fw commit cmd, get cq fail.\n"); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && + !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } else { + ret = hcqe->status >> 1; + } + return ret; +} + +int dn200_fw_i2c_rw_commit(struct dn200_ctrl_resource *ctrl, + u32 dev_addr, u8 *value, u32 offset, bool rw) +{ + struct ctrl_command c = {}; + int ret = -EBUSY; + int ret_val = 0; + u32 cq_val = 0; + int idx; + struct plat_dn200_data *plat_ex; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); +again: + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + memset(&c, 0, sizeof(c)); + idx = ctrl->cq_head; + c.fw_i2c_command.opcode = ctrl_admin_vendor_start; + c.fw_i2c_command.set = FW_I2C_RW; + c.fw_i2c_command.dword13 = dev_addr; + c.fw_i2c_command.dword14 = rw; + if (rw) + c.fw_i2c_command.dword14 |= ((u32)(*value) << 16); + c.fw_i2c_command.dword15 = offset; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (!rw) + *value = cq_val & 0xff; + ret_val = (cq_val >> 24) & 0xff; + if (ret < 0) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", + __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } + + return ret < 0 ? 
ret : ret_val; +} + +int dn200_nvme_product_info_get(struct dn200_ctrl_resource *ctrl, void *info, u32 len) +{ + dma_addr_t addr; + struct page *page; + void *buf; + u32 cq_val = 0; + struct ctrl_command c = {}; + struct plat_dn200_data *plat_ex; + int ret = -EBUSY; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_FROM_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_prod_info_get; + c.common_command.dptr.prp1 = addr; + /* aligned to 4 */ + c.common_command.cdw10 = cpu_to_le32(len + 3) >> 2; + + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &cq_val); + if (ret) + dev_err(plat_ex->priv_back->device, + "%s, %d, poll cq fail\n", __func__, __LINE__); + else + memcpy(info, buf, len); + + dn200_free_dma_page_dir(plat_ex->priv_back, + page, addr, DMA_FROM_DEVICE); + return ret; +} + +static int dn200_nvme_fw_cmd_exec(struct dn200_ctrl_resource *ctrl, + void *cmd, void *info, u32 len, u32 *ret_len, u32 time_out) +{ + dma_addr_t addr, addr1 = 0; + dma_addr_t *prp_list = NULL; + struct page *page = NULL; + struct page *page1 = NULL; + struct page **page_list = NULL; + void *buf = NULL; + void *buf1 = NULL; + unsigned long flags = 0; + struct ctrl_command c = {}; + struct plat_dn200_data *plat_ex; + int idx; + int ret = -EBUSY; + u32 read_len; + u32 entries; + u32 i, j; + u32 mem_len = 0; + bool have_reset = false; + + /* cmd len limit 120 */ + if (strlen(cmd) >= MAX_FW_CMD_LEN) + return -ENOMEM; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + /* send fw command */ + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_TO_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + strscpy(buf, cmd, MAX_FW_CMD_LEN); 
+again: + if (ctrl->is_extern_phy) + mutex_lock(&ctrl->mlock); + else + spin_lock_irqsave(&ctrl->lock, flags); + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_fw_cmd_write; + c.common_command.dptr.prp1 = addr; + /* aligned to 4 */ + c.common_command.cdw10 = (MAX_FW_CMD_LEN + 3) >> 2; + ctrl_submit_cmd(ctrl, &c); + /* default wait time 10s */ + if (!time_out) + time_out = 30000; + ret = dn200_ctrl_check_cq(ctrl, 5, 200 * time_out, NULL); + idx = (ctrl->cq_head + ctrl->q_depth - 1) % ctrl->q_depth; + + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_TO_DEVICE); + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", __func__, __LINE__); + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + return ret; + } + + if (ret_len) + *ret_len = ctrl->rdata[idx]; + + /* receive fw command log */ + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_FROM_DEVICE); + if (!page) { + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + return -ENOMEM; + } + buf = page_address(page); + + if (ctrl->rdata[idx] > CTRL_MAX_SEG_LEN) { + page1 = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr1, DMA_FROM_DEVICE); + if (!page1) { + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_FROM_DEVICE); + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + return -ENOMEM; + } + buf1 = page_address(page1); + } + + read_len = ctrl->rdata[idx] > len ? 
len : ctrl->rdata[idx]; + entries = ALIGN(read_len, CTRL_MAX_SEG_LEN) / CTRL_MAX_SEG_LEN; + + if (entries > 2) { + prp_list = buf1; + page_list = vmalloc(entries * sizeof(page_list)); + if (!page_list) { + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + ret = -ENOMEM; + goto free_page; + } + for (i = 0; i < entries - 1; i++) { + page_list[i] = dn200_alloc_dma_page_dir(plat_ex->priv_back, + &prp_list[i], DMA_FROM_DEVICE); + if (!page_list[i]) { + for (j = 0; j < i; j++) + dn200_free_dma_page_dir(plat_ex->priv_back, + page_list[j], prp_list[j], DMA_FROM_DEVICE); + + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + vfree(page_list); + ret = -ENOMEM; + goto free_page; + } + } + } + + have_reset = false; +reagain: + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_fw_cmd_read; + c.common_command.dptr.prp1 = addr; + c.common_command.dptr.prp2 = addr1; + /* aligned to 4 */ + c.common_command.cdw10 = (read_len + 3) >> 2; + ctrl_submit_cmd(ctrl, &c); + /* max wait time 10s */ + ret = dn200_ctrl_check_cq(ctrl, 5, 2000000, NULL); + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) { + if (ctrl->is_extern_phy) + mutex_lock(&ctrl->mlock); + else + spin_lock_irqsave(&ctrl->lock, flags); + goto reagain; + } + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } else { + i = 0; + if (entries > 2) { + memcpy(info, buf, CTRL_MAX_SEG_LEN); + mem_len = CTRL_MAX_SEG_LEN; + while (mem_len < read_len) { + if (mem_len + CTRL_MAX_SEG_LEN <= read_len) { + memcpy(info + (i + 1) * 
CTRL_MAX_SEG_LEN, + page_address(page_list[i]), CTRL_MAX_SEG_LEN); + mem_len += CTRL_MAX_SEG_LEN; + } else { + memcpy(info + (i + 1) * CTRL_MAX_SEG_LEN, + page_address(page_list[i]), read_len - mem_len); + mem_len = read_len; + } + i++; + } + } else if (entries > 1) { + memcpy(info, buf, CTRL_MAX_SEG_LEN); + memcpy(info + CTRL_MAX_SEG_LEN, buf1, read_len - CTRL_MAX_SEG_LEN); + } else { + memcpy(info, buf, read_len); + } + } + + if (entries > 2) { + for (i = 0; i < entries - 1; i++) + dn200_free_dma_page_dir(plat_ex->priv_back, + page_list[i], prp_list[i], DMA_FROM_DEVICE); + vfree(page_list); + } +free_page: + dn200_free_dma_page_dir(plat_ex->priv_back, + page, addr, DMA_FROM_DEVICE); + if (ctrl->rdata[idx] > CTRL_MAX_SEG_LEN) + dn200_free_dma_page_dir(plat_ex->priv_back, + page1, addr1, DMA_FROM_DEVICE); + return ret; +} + +int dn200_dev_temp_get(struct dn200_ctrl_resource *ctrl, void *info, u32 len) +{ + return dn200_nvme_fw_cmd_exec(ctrl, "core_temp", info, len, NULL, 0); +} + +static int dn200_nvme_dev_open(struct inode *inode, struct file *file) +{ + struct dn200_ctrl_resource *ctrl = + container_of(inode->i_cdev, struct dn200_ctrl_resource, cdev); + + file->private_data = ctrl; + return 0; +} + +static int dn200_nvme_dev_release(struct inode *inode, struct file *file) +{ + return 0; +} + +static long dn200_nvme_dev_passthru(struct dn200_ctrl_resource *ctrl, unsigned long arg) +{ + struct ctrl_completion *hcqe; + struct dn200_user_passth_command command; + struct dn200_user_passth_command __user *ucmd = (void __user *)arg; + struct ctrl_command c; + void *buf = NULL; + dma_addr_t addr; + struct page *page = NULL; + unsigned long flags = 0; + unsigned int timeout = 0; + int ret = -EBUSY; + struct dn200_priv *priv; + struct plat_dn200_data *plat_ex; + enum dma_data_direction dma_dir = DMA_TO_DEVICE; + bool have_reset = false; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + priv = plat_ex->priv_back; + + if (copy_from_user(&command, ucmd, 
sizeof(command))) + return -EFAULT; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) || + test_bit(ADMIN_UP_GRADE_FLAG, &ctrl->admin_state)) + return -EIO; + + memset(&c, 0, sizeof(c)); + c.common_command.opcode = command.opcode; + c.common_command.flags = command.flags; + c.common_command.cdw1 = cpu_to_le32(command.cdw1); + c.common_command.cdw2 = cpu_to_le32(command.cdw2); + c.common_command.cdw3 = cpu_to_le32(command.cdw3); + c.common_command.cdw10 = cpu_to_le32(command.cdw10); + c.common_command.cdw11 = cpu_to_le32(command.cdw11); + c.common_command.cdw12 = cpu_to_le32(command.cdw12); + c.common_command.cdw13 = cpu_to_le32(command.cdw13); + c.common_command.cdw14 = cpu_to_le32(command.cdw14); + c.common_command.cdw15 = cpu_to_le32(command.cdw15); + + if (command.data_len > CTRL_MAX_SEG_LEN) + return -EFAULT; + + if (command.opcode == ctrl_admin_fw_cmd_read || + command.opcode == ctrl_admin_prod_info_get || + command.rsvd1) { + dma_dir = DMA_FROM_DEVICE; + } + + if (command.addr && command.data_len) { + page = dn200_alloc_dma_page_dir(priv, &addr, dma_dir); + if (!page) + return -ENOMEM; + buf = page_address(page); + if (dma_dir == DMA_TO_DEVICE && + copy_from_user(buf, (void __user *)command.addr, command.data_len)) { + dn200_free_dma_page_dir(priv, page, addr, dma_dir); + dev_err(ctrl->dev, "[user command]copy from user fail, len %d.\n", + command.data_len); + return -EFAULT; + } + + c.common_command.dptr.prp1 = addr; + if (!c.common_command.cdw10) { + /* aligned to 4 */ + c.common_command.cdw10 = (command.data_len + 3) >> 2; + /* fw download len is 0 base val */ + if (command.opcode == ctrl_admin_download_fw && c.common_command.cdw10) + c.common_command.cdw10 -= 1; + } + c.common_command.cdw11 = (command.cdw11 + 3) >> 2; + } + + timeout = command.timeout_ms; +again: + if (ctrl->is_extern_phy) + mutex_lock(&ctrl->mlock); + else + spin_lock_irqsave(&ctrl->lock, flags); + ctrl_submit_cmd(ctrl, &c); + + ret = dn200_ctrl_check_cq(ctrl, 1000, timeout, 
&hcqe); + if (ctrl->is_extern_phy) + mutex_unlock(&ctrl->mlock); + else + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (command.opcode == ctrl_admin_activate_fw) + return 0; + + if (!ret && dma_dir == DMA_FROM_DEVICE && + copy_to_user((void __user *)command.addr, buf, command.data_len)) { + dn200_free_dma_page_dir(priv, page, addr, dma_dir); + dev_err(ctrl->dev, "[user command]copy to user fail, len %d.\n", command.data_len); + return -EFAULT; + } + + if (command.addr && command.data_len) + dn200_free_dma_page_dir(priv, page, addr, dma_dir); + + if (ret) { + dev_err(plat_ex->priv_back->device, "%s, %d, poll cq fail\n", __func__, __LINE__); + if (!test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state) && !have_reset) { + ret = ctrl_reset(ctrl, true); + have_reset = true; + if (!ret) + goto again; + } + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &ctrl->admin_state); + if (plat_ex->priv_back) + dn200_fw_err_dev_close(plat_ex->priv_back); + } else { + ret = hcqe->status >> 1; + command.result = le64_to_cpu(hcqe->result.u64); + if (put_user(command.result, &ucmd->result)) + return -EFAULT; + } + + return ret; +} + +static long dn200_card_info_get(struct dn200_ctrl_resource *ctrl, unsigned long arg) +{ + struct plat_dn200_data *plat_ex; + struct net_device *ndev; + struct ethtool_link_ksettings cmd; + struct dn200_card_info info = {}; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + ndev = plat_ex->priv_back->dev; + + memcpy(info.eth_name, ndev->name, IFNAMSIZ); + + info.link = !!netif_carrier_ok(ndev); + if (ndev->ethtool_ops->get_link_ksettings(ndev, &cmd)) + return -EINVAL; + info.speed = cmd.base.speed; + info.duplex = cmd.base.duplex; + + if (dn200_dev_temp_get(ctrl, info.eth_temp, sizeof(info.eth_temp))) + sprintf(info.eth_temp, "Unknown!\n"); + + return copy_to_user((void __user *)arg, &info, sizeof(struct dn200_card_info)); +} + +static long dn200_nvme_fw_cmd_exec_info_get(struct dn200_ctrl_resource *ctrl, unsigned long arg) +{ + struct 
dn200_user_passth_command command; + struct dn200_user_passth_command __user *ucmd = (void __user *)arg; + char *buf; + char cmd[MAX_FW_CMD_LEN] = {}; + int ret; + + if (copy_from_user(&command, ucmd, sizeof(command))) + return -EFAULT; + + if (command.metadata_len > MAX_FW_CMD_LEN) + return -ENOMEM; + + if (copy_from_user(cmd, (void __user *)command.metadata, command.metadata_len)) { + dev_err(ctrl->dev, "[fw cmd exec]copy from user fail, len %d.\n", + command.metadata_len); + return -EFAULT; + } + + buf = vmalloc(ALIGN(command.data_len, CTRL_MAX_SEG_LEN)); + if (!buf) + return -ENOMEM; + + ret = dn200_nvme_fw_cmd_exec(ctrl, cmd, buf, + command.data_len, (u32 *)&command.result, command.timeout_ms); + if (ret) { + vfree(buf); + return ret; + } + + if (copy_to_user((void __user *)command.addr, buf, command.data_len)) { + dev_err(ctrl->dev, "[fw cmd exec]copy to user fail, len %d.\n", + command.data_len); + vfree(buf); + return -EFAULT; + } + vfree(buf); + if (put_user(command.result, &ucmd->result)) { + dev_err(ctrl->dev, "[fw cmd exec]put user fail.\n"); + return -EFAULT; + } + return 0; +} + +int dn200_configure_timestamp(struct dn200_ctrl_resource *ctrl) +{ + __le64 ts; + struct ctrl_command c; + struct page *page; + void *buf; + dma_addr_t addr; + int ret, value; + struct plat_dn200_data *plat_ex; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + memset(&c, 0, sizeof(c)); + c.features.opcode = dn200_admin_set_features; + c.features.fid = DN200_FEAT_TIMESTAMP; + c.features.dword11 = 0; + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_TO_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + memcpy(buf, &ts, sizeof(ts)); + c.features.dptr.prp1 = addr; + + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &value); + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_TO_DEVICE); + return ret; +} + +int dn200_get_fw_ver(struct 
dn200_ctrl_resource *ctrl, struct dn200_ver *dn200_ver) +{ + struct ctrl_command c; + struct page *page; + void *buf; + dma_addr_t addr; + int ret, value; + struct plat_dn200_data *plat_ex; + + memset(&c, 0, sizeof(c)); + c.ctrl_identify.opcode = ctrl_admin_fw_identify; + c.ctrl_identify.cns = DN200_VER_CNS; + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_FROM_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + c.ctrl_identify.dptr.prp1 = addr; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &value); + /*process ver*/ + dn200_ver->type = *(u8 *)(buf + 64); + dn200_ver->product_type = *(u8 *)(buf + 65); + dn200_ver->rsv = *(u8 *)(buf + 66); + dn200_ver->is_fw = *(u8 *)(buf + 67); + dn200_ver->publish = *(u8 *)(buf + 68); + dn200_ver->number0 = *(u8 *)(buf + 69); + dn200_ver->number1 = *(u8 *)(buf + 70); + dn200_ver->number2 = *(u8 *)(buf + 71); + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_FROM_DEVICE); + return ret; +} + +int dn200_eeprom_read(struct dn200_ctrl_resource *ctrl, u32 offset, u32 len, u8 *data) +{ + void *buf = NULL; + dma_addr_t addr; + struct page *page = NULL; + struct plat_dn200_data *plat_ex; + int ret; + u32 val; + struct ctrl_command c; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_eeprom_read; + c.common_command.cdw12 = 0x01; + c.common_command.cdw13 = offset; + c.common_command.cdw10 = ALIGN(len, 4) >> 2; + + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_FROM_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + + c.common_command.dptr.prp1 = addr; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &val); + + if (!ret) + memcpy(data, buf, len); + + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_FROM_DEVICE); + + return ret; +} + +int dn200_eeprom_write(struct 
dn200_ctrl_resource *ctrl, u32 offset, u32 len, u8 *data) +{ + void *buf = NULL; + dma_addr_t addr; + struct page *page = NULL; + struct plat_dn200_data *plat_ex; + int ret; + u32 val; + struct ctrl_command c; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + + memset(&c, 0, sizeof(c)); + c.common_command.opcode = ctrl_admin_eeprom_write; + c.common_command.cdw12 = 0x02; + c.common_command.cdw13 = offset; + c.common_command.cdw10 = ALIGN(len, 4) >> 2; + + page = dn200_alloc_dma_page_dir(plat_ex->priv_back, &addr, DMA_TO_DEVICE); + if (!page) + return -ENOMEM; + buf = page_address(page); + + memcpy(buf, data, len); + + c.common_command.dptr.prp1 = addr; + ctrl->dn200_admin_process_ops->dn200_admin_process(ctrl, &c, &ret, &val); + + dn200_free_dma_page_dir(plat_ex->priv_back, page, addr, DMA_TO_DEVICE); + + return ret; +} + +static long dn200_nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + long ret = 0; + struct dn200_ctrl_resource *ctrl = file->private_data; + + switch (cmd) { + case DN200_NVME_PASSTHRU: + ret = dn200_nvme_dev_passthru(ctrl, arg); + break; + case DN200_NVME_GET_CARD_INFO: + ret = dn200_card_info_get(ctrl, arg); + break; + case DN200_NVME_GET_FW_CMD_LOG: + ret = dn200_nvme_fw_cmd_exec_info_get(ctrl, arg); + break; + default: + ret = dn200_nvme_dev_passthru(ctrl, arg); + break; + } + + return ret; +} + +static int dn200_nvme_mmap_bar4(struct file *file, struct vm_area_struct *vma) +{ + struct dn200_ctrl_resource *ctrl = file->private_data; + struct plat_dn200_data *plat_ex; + unsigned long base; + unsigned long size; + unsigned long offset; + unsigned long pfn; + unsigned long vsize; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + base = plat_ex->pdev->resource[4].start; + size = resource_size(&plat_ex->pdev->resource[4]); + + offset = vma->vm_pgoff << PAGE_SHIFT; + pfn = (base + offset) >> PAGE_SHIFT; + vsize = vma->vm_end - vma->vm_start; + + if (vsize > size || offset >= size) + return 
-EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + return remap_pfn_range(vma, vma->vm_start, pfn, vsize, vma->vm_page_prot); +} + +static const struct file_operations dn200_nvme_ops = { + .owner = THIS_MODULE, + .open = dn200_nvme_dev_open, + .mmap = dn200_nvme_mmap_bar4, + .release = dn200_nvme_dev_release, + .unlocked_ioctl = dn200_nvme_dev_ioctl, +}; + +static u32 dn200_nvme_major; +static u32 dn200_nvme_num; +static struct class *nvme_class; +static unsigned long mask[BITS_TO_LONGS(MAX_NVME_NUM)]; + +void dn200_register_nvme_device(struct dn200_ctrl_resource *ctrl) +{ + char dev_name[64] = {0}; + struct plat_dn200_data *plat_ex; + dev_t devt; + unsigned long i; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (snprintf(dev_name, sizeof(dev_name), "dn200_nvme%x-%x-%x", + pci_domain_nr(plat_ex->pdev->bus), plat_ex->pdev->bus->number, plat_ex->funcid) < 0) { + dev_err(plat_ex->priv_back->device, "nvme device name is too long\n"); + return; + } + if (!dn200_nvme_major) { + if (alloc_chrdev_region(&devt, 0, MAX_NVME_NUM, dev_name) < 0) { + dev_err(plat_ex->priv_back->device, "alloc chrdev region fail\n"); + return; + } + dn200_nvme_major = MAJOR(devt); + + nvme_class = class_create("dn200_nvme"); + if (IS_ERR(nvme_class)) { + dev_err(plat_ex->priv_back->device, "class creat fail\n"); + goto err_chrdev_unreg; + } + } + + for_each_clear_bit(i, mask, MAX_NVME_NUM) { + set_bit(i, mask); + ctrl->devt = MKDEV(dn200_nvme_major, i); + break; + } + + dn200_nvme_num++; + + cdev_init(&ctrl->cdev, &dn200_nvme_ops); + + if (cdev_add(&ctrl->cdev, ctrl->devt, 1)) { + dev_err(plat_ex->priv_back->device, "cdev add fail\n"); + goto err_class_release; + } + + if (IS_ERR(device_create(nvme_class, NULL, ctrl->devt, NULL, dev_name))) { + dev_err(plat_ex->priv_back->device, "device create fail\n"); + goto err_dev_del; + } + return; +err_dev_del: + cdev_del(&ctrl->cdev); +err_class_release: + class_destroy(nvme_class); +err_chrdev_unreg: + 
unregister_chrdev_region(ctrl->devt, MAX_NVME_NUM); +} + +void dn200_unregister_nvme_device(struct dn200_ctrl_resource *ctrl) +{ + if (nvme_class) + device_destroy(nvme_class, ctrl->devt); + cdev_del(&ctrl->cdev); + clear_bit(MINOR(ctrl->devt), mask); + dn200_nvme_num--; + if (nvme_class && !dn200_nvme_num) { + dn200_nvme_major = 0; + class_destroy(nvme_class); + unregister_chrdev_region(ctrl->devt, MAX_NVME_NUM); + } +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.h b/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.h new file mode 100644 index 000000000000..59c77f440d60 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_ctrl.h @@ -0,0 +1,431 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. + * + * Author: Chen Jisi + * + * Interactive configuration with the controller + */ + +#ifndef __DN200_CTRL_H__ +#define __DN200_CTRL_H__ +// #include "common.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SQ_DEPTH 32 + +#define IRQ_INFO_PFVF_RELEASE 0x22 +#define GET_FUNCID 0x32 +#define IRQ_INFO_NOTICE 0x3a +#define IRQ_INFO_CONFIG 0xc2 +#define REG_WRITE 0x8 +#define REG_READ 0x0 +#define PCB_TYPE 0x55 +#define RJ45_TYPE 0x65 +#define FW_UP_COMMIT 0x6d +#define FW_UP_STATUS 0x75 +#define FW_SLOT_GET 0x7d +#define LRAM_RXP_LOCK 0x1a +#define FW_I2C_RW 0x92 +#define FW_PHY_RST 0x9a +#define FW_PHY_EYE 0xba +#define FW_PWM_CTRL 0xca +#define FW_STATE_GET 0x5d + +#define ADMIN_TIMEOUT 1 +#define ADMIN_MAX_WAIT_TIME (2 * 1000) +#define ADMIN_IRQ_RELEASE_MSEC 100 +#define DN200_POLL_CQ_MAX_TIMES 8000000 +#define DN200_POLL_CQ_MAX_TIMES_ATOMIC 100000 + +#define CSTS_NSSRO BIT(4) +#define CTRL_CAP_CSS_CSI BIT(6) +#define CTRL_CC_CSS_CSI (6 << 4) +#define CTRL_CC_CSS_NVM (0 << 4) +#define CTRL_CC_IOSQES (6 << 16) +#define CTRL_CC_IOCQES (4 << 20) +#define CTRL_CC_ENABLE BIT(0) +#define CTRL_CC_SHN_MASK (3 << 14) +#define CTRL_CC_SHN_NORMAL BIT(14) 
+#define CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define CTRL_MAX_SEG_LEN 0x1000 +#define MAX_NVME_NUM 512 +#define MAX_FW_CMD_LEN 120 +#define DN200_NVME_PASSTHRU _IOWR('D', 0x10, struct dn200_user_passth_command) +#define DN200_NVME_GET_CARD_INFO _IOR('D', 0x11, struct dn200_card_info) +#define DN200_NVME_GET_FW_CMD_LOG _IOWR('D', 0x12, struct dn200_user_passth_command) + +#define dn200_admin_set_features 0x9 +#define DN200_FEAT_TIMESTAMP 0x0e +#define DN200_VER_CNS 0x1 + +#define DN200_FW_UPGRADE_COMIT_IDLE (0) +#define DN200_FW_UPGRADE_COMIT_DOING (1) +#define DN200_FW_UPGRADE_COMIT_FINISH (2) + +struct dn200_ver { + u8 type; + u8 product_type; + u8 rsv; + u8 is_fw; + u8 publish; + u8 number0; + u8 number1; + u8 number2; +}; + +enum { + REG_READ_WAIT = 0, + REG_WRITE_WAIT, + REG_READ_FAST, + REG_WRITE_FAST, +}; + +union ctrl_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; +}; + +struct ctrl_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union ctrl_data_ptr dptr; + __le32 fid; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct ctrl_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union ctrl_data_ptr dptr; + __le32 cns; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +enum ctrl_admin_dn200_opcode { + ctrl_admin_activate_fw = 0x10, + ctrl_admin_download_fw = 0x11, + ctrl_admin_vendor_start = 0xC0, + ctrl_admin_prod_info_set = 0xC1, + ctrl_admin_prod_info_get = 0xC2, + ctrl_admin_fw_cmd_write = 0xCD, + ctrl_admin_fw_cmd_read = 0xCE, + ctrl_admin_fw_identify = 0x6, + ctrl_admin_eeprom_write = 0xD9, + ctrl_admin_eeprom_read = 0xDA, +}; + +struct dn200_reg_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw3; + __le32 cdw4; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 ndt; + __le32 ndm; + __le32 set; + __le32 addr; + 
__le32 value; + __le32 mask; +}; + +struct dn200_irq_share_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw3; + __le32 cdw4; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 ndt; + __le32 ndm; + __le32 set; + __le32 tx_q; + __le32 rx_q; + __le32 dword15; +}; + +struct dn200_irq_release_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw3; + __le32 cdw4; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 ndt; + __le32 ndm; + __le32 set; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct dn200_fw_download_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 cdw1; + __le32 cdw2; + __le32 cdw3; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 numd; + __le32 ofst; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct dn200_fw_commit_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 cdw1; + __le32 cdw2; + __le32 cdw3; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 dword10; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct dn200_fw_i2c_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 cdw1; + __le32 cdw2; + __le32 cdw3; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 dword10; + __le32 dword11; + __le32 set; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct dn200_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 cdw1; + __le32 cdw2; + __le32 cdw3; + __le64 metadata; + union ctrl_data_ptr dptr; + __le32 cdw10; + __le32 cdw11; + __le32 cdw12; + __le32 cdw13; + __le32 cdw14; + __le32 cdw15; +}; + +struct dn200_user_passth_command { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 cdw1; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; 
+ __u32 timeout_ms; + __u32 rsvd2; + __u64 result; +}; + +struct ctrl_command { + union { + struct ctrl_identify ctrl_identify; + struct ctrl_features features; + struct dn200_reg_rw_command rw_command; + struct dn200_irq_share_command irq_share_command; + struct dn200_irq_release_command irq_release_command; + struct dn200_fw_download_command fw_download_command; + struct dn200_fw_commit_command fw_commit_command; + struct dn200_fw_i2c_command fw_i2c_command; + struct dn200_common_command common_command; + }; +}; + +enum { + REG_CAP = 0x0000, /* Controller Capabilities */ + REG_VS = 0x0008, /* Version */ + REG_INTMS = 0x000c, /* Interrupt Mask Set */ + REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + REG_CC = 0x0014, /* Controller Configuration */ + REG_CSTS = 0x001c, /* Controller Status */ + REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + REG_AQA = 0x0024, /* Admin Queue Attributes */ + REG_ASQ = 0x0028, /* Admin SQ Base Address */ + REG_ACQ = 0x0030, /* Admin CQ Base Address */ + REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + REG_BPINFO = 0x0040, /* Boot Partition Information */ + REG_BPRSEL = 0x0044, /* Boot Partition Read Select */ + REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer* Location */ + REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory* Space Control */ + REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ + REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ + REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ + REG_PMREBS = 0x0e0c, /* Persistent Memory Region Elasticity* Buffer Size */ + REG_PMRSWTP = 0x0e10, /* Persistent Memory Region Sustained* Write Throughput */ + REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ +}; + +struct ctrl_completion { + /* Used by Admin and Fabrics commands to return data: */ + union ctrl_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; + __le16 sq_head; /* how much of this queue may be reclaimed */ + 
__le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +struct dn200_ctrl_resource { + struct device *dev; + struct ctrl_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + void *sq_cmds; + u64 cap; + u32 q_depth; + u16 cq_vector; + u16 sq_tail; + u16 last_sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + u8 sqes; + u32 db_stride; + u8 funcid; + void __iomem *dbs; + void __iomem *bar; + unsigned long flags; + struct msix_entry *msix_entries; + struct timer_list ctrl_timer; + const struct dn200_ctrl_ops *ctrl_ops; + char itr_name_ctrl[IFNAMSIZ + 9]; + char itr_name_peer_notify[IFNAMSIZ + 9]; + char itr_name_upgrade[IFNAMSIZ + 9]; + const struct admin_process_ops *dn200_admin_process_ops; + u32 rdata[SQ_DEPTH]; + spinlock_t lock; /* dn200 ctrl spinlock. */ + struct mutex mlock; + unsigned long admin_state; + u8 addr64; + dev_t devt; + struct cdev cdev; + bool pcie_ava; + bool is_extern_phy; +}; + +enum admin_queue_state { + ADMIN_QUEUE_IDLE, + ADMIN_QUEUE_CQ_NO_RETURN, + ADMIN_QUEUE_INITED, + ADMIN_UP_GRADE_FLAG, +}; + +struct dn200_card_info { + u32 link; + u32 speed; + u32 duplex; + u32 type; + char eth_name[64]; + char eth_temp[64]; +}; + +struct admin_process_ops { + void (*dn200_admin_process)(struct dn200_ctrl_resource *ctrl, + struct ctrl_command *c, int *ret_val, u32 *cq_val); +}; + +int ctrl_reset(struct dn200_ctrl_resource *ctrl, bool reset_irq); +int admin_queue_configure(struct pci_dev *pdev, + struct dn200_ctrl_resource *ctrl, bool is_purepf, bool is_extern_phy); +void shutdown_ctrl(struct dn200_ctrl_resource *ctrl); +int irq_info_pfvf_release(struct pci_dev *pdev, + struct dn200_ctrl_resource *ctrl, bool need_retry); +int dn200_ena_msix_range(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl, + int tx_queues_to_use, int rx_queues_to_use, + bool is_purepf); +int irq_queue_map(struct pci_dev *pdev, 
struct dn200_ctrl_resource *ctrl, + int tx_queues_to_use, int rx_queues_to_use, bool need_retry); +int dn200_ctrl_ccena(struct pci_dev *pdev, bool off, bool on, bool is_atomic); +int fw_reg_write(struct dn200_ctrl_resource *ctrl, u32 reg, u32 value); +int fw_reg_read(struct dn200_ctrl_resource *ctrl, u32 reg, u32 *value); +int irq_peer_notify(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl); +int dn200_ctrl_res_free(struct pci_dev *pdev, struct dn200_ctrl_resource *ctrl); +int get_pcb_type(struct dn200_ctrl_resource *ctrl, u32 *value); +int get_rj45_type(struct dn200_ctrl_resource *ctrl, u32 *value); +int lram_and_rxp_lock_and_unlock(struct dn200_ctrl_resource *ctrl, u32 value, + u32 *ret_val); +int dn200_nvme_fw_load(struct dn200_ctrl_resource *ctrl, const char *fw, + size_t fw_size); +int dn200_nvme_fw_commit(struct dn200_ctrl_resource *ctrl); +int dn200_fw_i2c_rw_commit(struct dn200_ctrl_resource *ctrl, u32 dev_addr, + u8 *value, u32 offset, bool rw); +int fw_phy_set(struct dn200_ctrl_resource *ctrl, bool type); +int dn200_nvme_product_info_get(struct dn200_ctrl_resource *ctrl, void *info, u32 len); +int dn200_dev_temp_get(struct dn200_ctrl_resource *ctrl, void *info, u32 len); +void dn200_register_nvme_device(struct dn200_ctrl_resource *ctrl); +void dn200_unregister_nvme_device(struct dn200_ctrl_resource *ctrl); +int dn200_led_blink_ctrl(struct dn200_ctrl_resource *ctrl, bool is_enable); +int fw_link_state_set(struct dn200_ctrl_resource *ctrl, u8 link_state, u8 duplex, u32 speed); +int dn200_configure_timestamp(struct dn200_ctrl_resource *ctrl); +int dn200_get_fw_ver(struct dn200_ctrl_resource *ctrl, struct dn200_ver *dn200_ver); +int dn200_eeprom_read(struct dn200_ctrl_resource *ctrl, u32 offset, u32 len, u8 *data); +int dn200_eeprom_write(struct dn200_ctrl_resource *ctrl, u32 offset, u32 len, u8 *data); +int ctrl_reinitial(struct dn200_ctrl_resource *ctrl); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_dcb.c 
b/drivers/net/ethernet/dapustor/dn200/dn200_dcb.c new file mode 100644 index 000000000000..9920f743d583 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_dcb.c @@ -0,0 +1,913 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ +#include +#include +#include "dn200_dcb.h" + +#define DN200_DMA_STOP_TIMEOUT 1 +#define DN200_DCB_MAX_FLOW_CONTROL_QUEUES 8 +#define DN200_FIFO_MIN_ALLOC 2048 + +#define DN200_PRIO_QUEUES(_cnt) min_t(unsigned int, IEEE_8021QAZ_MAX_TCS, (_cnt)) +#define DN200_FIFO_UNIT 256 + +static u8 dn200_dcb_getdcbx(struct net_device *netdev) +{ + struct dn200_priv *priv = netdev_priv(netdev); + + netif_dbg(priv, drv, netdev, "(%s) get dcbx\n", __func__); + return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; +} + +static u8 dn200_dcb_setdcbx(struct net_device *netdev, u8 dcbx) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + + netif_dbg(priv, drv, netdev, "set DCBX=%#x\n", dcbx); + + if (dcbx & ~support) + return 1; + + if ((dcbx & support) != support) + return 1; + + return 0; +} + +static void dn200_config_queue_mapping(struct dn200_priv *priv) +{ + u32 queue; + + /* Map the TxQ to Traffic Class in dn200_dma_tx_mode(), like + * TxQ[0,7]->TC[0,7]. + */ + for (queue = 0; queue < priv->plat->tx_queues_to_use; queue++) + priv->q2tc_map[queue] = queue; + + /* Map the 8 VLAN priority values to available Rx Queues in + * dn200_mac_config_rx_queues_prio() + */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) + priv->prio2q_map[queue] = queue; + + /** + * Set static mapping in dn200_rx_queue_dma_chan_map(), like + * RxQ[0,7]->channel[0,7] in dn200_default_data(). However + * dynamic mapping will most likely be configured instead of + * static mapping, such as RSS, RXP and etc. 
+ */ +} + +static void dn200_config_tc(struct dn200_priv *priv) +{ + unsigned int offset, queue, prio; + u8 i; + + netdev_reset_tc(priv->dev); + if (!priv->num_tcs) + return; + + netdev_set_num_tc(priv->dev, priv->num_tcs); + netif_dbg(priv, drv, priv->dev, "num_tc %d\n", priv->dev->num_tc); + + for (i = 0, queue = 0, offset = 0; i < priv->num_tcs; i++) { + while ((queue < priv->plat->tx_queues_to_use) && + (priv->q2tc_map[queue] == i)) + queue++; + + netif_dbg(priv, drv, priv->dev, "TC%u using TXq%u-%u\n", + i, offset, queue - 1); + netdev_set_tc_queue(priv->dev, i, queue - offset, offset); + offset = queue; + } + + if (!priv->ets) + return; + + for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++) { + netdev_set_prio_tc_map(priv->dev, prio, + priv->ets->prio_tc[prio]); + netif_dbg(priv, drv, priv->dev, "prio %d assigned to tc %d\n", + prio, priv->ets->prio_tc[prio]); + } +} + +static void dn200_config_dcb_tc(struct dn200_priv *priv) +{ + struct ieee_ets *ets = priv->ets; + unsigned int total_weight, min_weight, weight; + unsigned int mask; + unsigned int i, prio; + u32 value; + + if (!ets) + return; + + /* Set Tx to deficit weighted round robin scheduling algorithm (when + * traffic class is using ETS algorithm) + */ + value = readl(priv->ioaddr + XGMAC_MTL_OPMODE); + value &= ~XGMAC_ETSALG; + value |= XGMAC_DWRR; + writel(value, priv->ioaddr + XGMAC_MTL_OPMODE); + + /* Set Traffic Class algorithms */ + total_weight = priv->dev->mtu * priv->dma_cap.tc_cnt; + min_weight = total_weight / 100; + if (!min_weight) + min_weight = 1; + + /* Initialize prio2tc bitmap to 0 */ + for (i = 0; i < 8; i++) + priv->prio2tc_bitmap[i] = 0; + + for (i = 0; i < priv->dma_cap.tc_cnt; i++) { + /* Map the priorities to the traffic class */ + mask = 0; + for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++) { + if (ets->prio_tc[prio] == i) { + mask |= (1 << prio); + priv->prio2tc_bitmap[prio] |= (1 << i); + } + } + mask &= 0xff; + netif_dbg(priv, drv, priv->dev, "TC%u PRIO mask=%#x\n", i, 
+ mask); + + /* Map priorities to the traffic class */ + value = + readl(priv->ioaddr + XGMAC_TC_PRTY_MAP0 + 0x4 * (i / 4)); + value &= ~XGMAC_PSTC(i % 4); + value |= (mask << XGMAC_PSTC_SHIFT(i % 4)) & XGMAC_PSTC(i % 4); + writel(value, + priv->ioaddr + XGMAC_TC_PRTY_MAP0 + 0x4 * (i / 4)); + netif_dbg(priv, drv, priv->dev, + "REG[TC_PRTY_MAP%d] value=%08x\n", i / 4, value); + + /* Set the traffic class algorithm */ + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + netif_dbg(priv, drv, priv->dev, "TC%u using SP\n", i); + value = + readl(priv->ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + value &= ~XGMAC_TSA; + value |= XGMAC_SP; + writel(value, + priv->ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + break; + case IEEE_8021QAZ_TSA_ETS: + weight = total_weight * ets->tc_tx_bw[i] / 100; + weight = clamp(weight, min_weight, total_weight); + netif_dbg(priv, drv, priv->dev, + "TC%u using DWRR (weight %u)\n", i, weight); + /* Must set ETS if algorithm is WRR, WFQ or DWRR */ + value = + readl(priv->ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + value &= ~XGMAC_TSA; + value |= XGMAC_ETS; + writel(value, + priv->ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + /* Set TC quantum */ + writel(weight, + priv->ioaddr + XGMAC_MTL_TCx_QUANTUM_WEIGHT(i)); + break; + default: + break; + } + } + + dn200_config_tc(priv); +} + +static int dn200_dcb_ieee_getets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + /* Set number of supported traffic classes */ + ets->ets_cap = priv->dma_cap.tc_cnt; + + if (priv->ets) { + ets->cbs = priv->ets->cbs; + memcpy(ets->tc_tx_bw, priv->ets->tc_tx_bw, + sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_tsa, priv->ets->tc_tsa, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, priv->ets->prio_tc, sizeof(ets->prio_tc)); + } + + netif_dbg(priv, drv, netdev, "(%s) get ets\n", __func__); + return 0; 
+} + +static int dn200_dcb_ieee_setets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct dn200_priv *priv = netdev_priv(netdev); + unsigned int i, tc_ets, tc_ets_weight; + u8 max_tc = 0; + u8 support = dn200_dcb_getdcbx(netdev); + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + if (!priv->ets) { + priv->ets = devm_kzalloc(priv->device, sizeof(*priv->ets), + GFP_KERNEL); + if (!priv->ets) + return -ENOMEM; + } + /* if don't change any ets settings, don't go on */ + if (memcmp(priv->ets, ets, sizeof(*priv->ets)) == 0) + return 0; + memcpy(priv->ets, ets, sizeof(*priv->ets)); + + tc_ets = 0; + tc_ets_weight = 0; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + max_tc = max_t(u8, max_tc, ets->prio_tc[i]); + if ((ets->tc_tx_bw[i] || ets->tc_tsa[i])) + max_tc = max_t(u8, max_tc, i); + + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + break; + case IEEE_8021QAZ_TSA_ETS: + tc_ets = 1; + tc_ets_weight += ets->tc_tx_bw[i]; + break; + default: + /** + * Hardware only supports priority strict or + * ETS transmission selection algorithms if + * we receive some other value from dcbnl + * throw an error + */ + netif_warn(priv, drv, netdev, + "unsupported TSA algorithm (%hhuu)\n", + ets->tc_tsa[i]); + return -EINVAL; + } + } + + /* Check maximum traffic class requested */ + if (max_tc >= priv->dma_cap.tc_cnt) { + netif_warn(priv, drv, netdev, + "exceeded number of supported traffic classes\n"); + return -EINVAL; + } + + /* Weights must add up to 100% */ + if (tc_ets_weight != 100) { + netif_warn(priv, drv, netdev, + "sum of ETS algorithm weights is not 100 (%u)\n", + tc_ets_weight); + return -EINVAL; + } + + priv->num_tcs = max_tc + 1; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + netif_dbg(priv, drv, netdev, + "TC%u: tx_bw=%#x, rx_bw=%#x, tsa=%#x\n", i, + ets->tc_tx_bw[i], ets->tc_rx_bw[i], ets->tc_tsa[i]); + netif_dbg(priv, drv, netdev, "PRIO%u: TC=%#x\n", i, + ets->prio_tc[i]); + } + netif_dbg(priv, drv, 
netdev, "(%s) set ets\n", __func__); + dn200_config_dcb_tc(priv); + + return 0; +} + +static bool dn200_is_pfc_queue(struct dn200_priv *priv, unsigned int queue) +{ + unsigned int prio, tc; + + for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++) { + /* Does this queue handle the priority? */ + if (priv->prio2q_map[prio] != queue) + continue; + + /* Get the Traffic Class for this priority */ + tc = priv->ets->prio_tc[prio]; + + /* Check if PFC is enabled for this traffic class */ + if (priv->pfc->pfc_en & (1 << tc)) + return true; + } + + return false; +} + +static unsigned int dn200_get_pfc_queues(struct dn200_priv *priv) +{ + unsigned int count, prio_queues; + unsigned int i; + + if (!priv->pfc->pfc_en) + return 0; + + count = 0; + prio_queues = DN200_PRIO_QUEUES(priv->plat->rx_queues_to_use); + for (i = 0; i < prio_queues; i++) { + if (!dn200_is_pfc_queue(priv, i)) + continue; + + priv->pfcq[i] = 1; + count++; + } + + return count; +} + +static void dn200_config_rx_fifo_size(struct dn200_priv *priv) +{ + unsigned int fifo[MTL_MAX_RX_QUEUES]; + unsigned int pfc_queues; + unsigned int queue; + u32 value; + + /* Clear any DCB related fifo/queue information */ + memset(priv->pfcq, 0, sizeof(priv->pfcq)); + + pfc_queues = dn200_get_pfc_queues(priv); + + /** + * Assign equal Rx FIFO size and set flow control + * RFA/RFD in dn200_dma_operation_mode(). If FIFO + * size is equal or more than 4K, EHFC(Enable HW + * Flow Control) will be set. + */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { + value = readl(priv->ioaddr + XGMAC_MTL_RXQ_OPMODE(queue)); + fifo[queue] = (value & XGMAC_RQS) >> XGMAC_RQS_SHIFT; + netif_dbg(priv, drv, priv->dev, "RxQ%u, %u byte fifo queue\n", + queue, ((fifo[queue] + 1) * DN200_FIFO_UNIT)); + } + + /** + * There are only total 64K FIFO due to Hardware limit. + * In the case of 8 queues enabled, each queue can get + * 8KB fifo on average, which means it maybe unnecessary + * to set RFA and RFD in the unit of 0.5KB. 
+ */ +} + +static void dn200_config_flow_control(struct dn200_priv *priv) +{ + struct ieee_pfc *pfc = priv->pfc; + u32 value; + + if (pfc && pfc->pfc_en) { + /* Set RFE for Rx flow control and TFE/PT for Tx */ + dn200_flow_ctrl(priv, priv->hw, DUPLEX_FULL, FLOW_AUTO, + PAUSE_TIME, priv->plat->tx_queues_to_use); + + /* Enable PFC instead of Pause Frame */ + value = readl(priv->ioaddr + XGMAC_RX_FLOW_CTRL); + value |= XGMAC_PFCE; + writel(value, priv->ioaddr + XGMAC_RX_FLOW_CTRL); + } else { + /* Restore to the original pause configuration */ + dn200_flow_ctrl(priv, priv->hw, priv->duplex, priv->flow_ctrl, + priv->pause, priv->plat->tx_queues_to_use); + + /* Disable PFC mode */ + value = readl(priv->ioaddr + XGMAC_RX_FLOW_CTRL); + value &= ~XGMAC_PFCE; + writel(value, priv->ioaddr + XGMAC_RX_FLOW_CTRL); + } +} + +static void dn200_prepare_rx_stop(struct dn200_priv *priv, u32 queue) +{ + u32 rx_status; + unsigned long rx_timeout; + + /* The Rx engine cannot be stopped if it is actively processing + * packets. Wait for the Rx queue to empty the Rx fifo. Don't + * wait forever though... 
+ */ + rx_timeout = jiffies + (DN200_DMA_STOP_TIMEOUT * HZ); + while (time_before(jiffies, rx_timeout)) { + rx_status = readl(priv->ioaddr + XGMAC_MTL_RXQ_DEBUG(queue)); + if (((rx_status & XGMAC_PRXQ) == 0) + && ((rx_status & XGMAC_RXQSTS) == 0)) + break; + usleep_range(500, 1000); + } + + if (!time_before(jiffies, rx_timeout)) + netdev_info(priv->dev, + "timed out waiting for Rx queue %u to empty\n", + queue); +} + +static void dn200_enable_rx(struct dn200_priv *priv) +{ + u32 value, queue; + u8 mode; + + /* Enable each Rx DMA channel */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) + dn200_start_rx(priv, priv->ioaddr, queue, priv->hw); + + /* Enable each Rx queue */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { + mode = priv->plat->rx_queues_cfg[queue].mode_to_use; + dn200_rx_queue_enable(priv, priv->hw, mode, queue); + } + + if (!PRIV_IS_VF(priv)) { + /* broadcast & mutlicast put to last mtl queue and copy to all dma channels */ + queue = DN200_LAST_QUEUE(priv); + dn200_rx_queue_enable(priv, priv->hw, MTL_QUEUE_DCB, queue); + } + + /* Enable MAC Rx */ + value = readl(priv->ioaddr + XGMAC_RX_CONFIG); + value |= XGMAC_CONFIG_RE; + writel(value, priv->ioaddr + XGMAC_RX_CONFIG); +} + +static void dn200_disable_rx(struct dn200_priv *priv) +{ + u32 value, queue; + + /* Disable MAC Rx */ + value = readl(priv->ioaddr + XGMAC_RX_CONFIG); + value &= ~XGMAC_CONFIG_RE; + writel(value, priv->ioaddr + XGMAC_RX_CONFIG); + + /* Prepare for Rx DMA channel stop */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) + dn200_prepare_rx_stop(priv, queue); + + /* Disable each Rx queue */ + writel(0, priv->ioaddr + XGMAC_RXQ_CTRL0); + + /* Disable each Rx DMA channel */ + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) + dn200_stop_rx(priv, priv->ioaddr, queue, priv->hw); +} + +static void dn200_config_dcb_pfc(struct dn200_priv *priv) +{ + if (!test_bit(DN200_DCB_DOWN, &priv->state)) { + /* Just stop the Tx queues 
while Rx fifo is changed */ + netif_tx_stop_all_queues(priv->dev); + + /* Suspend Rx so that fifo's can be adjusted */ + dn200_disable_rx(priv); + } + + dn200_config_rx_fifo_size(priv); + dn200_config_flow_control(priv); + + if (!test_bit(DN200_DCB_DOWN, &priv->state)) { + /* Resume Rx */ + dn200_enable_rx(priv); + + /* Resume Tx queues */ + netif_tx_start_all_queues(priv->dev); + } +} + +static int dn200_dcb_ieee_getpfc(struct net_device *netdev, + struct ieee_pfc *pfc) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + /* Set number of supported PFC traffic classes */ + pfc->pfc_cap = priv->dma_cap.tc_cnt; + + if (priv->pfc) { + pfc->pfc_en = priv->pfc->pfc_en; + pfc->mbc = priv->pfc->mbc; + pfc->delay = priv->pfc->delay; + } + + netif_dbg(priv, drv, netdev, "(%s) get pfc\n", __func__); + return 0; +} + +static int dn200_dcb_ieee_setpfc(struct net_device *netdev, + struct ieee_pfc *pfc) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + if (!priv->pfc) { + priv->pfc = devm_kzalloc(priv->device, sizeof(*priv->pfc), + GFP_KERNEL); + if (!priv->pfc) + return -ENOMEM; + } + /* if don't change any pfc settings, don't go on */ + if (memcmp(priv->pfc, pfc, sizeof(*priv->pfc)) == 0) + return 0; + memcpy(priv->pfc, pfc, sizeof(*priv->pfc)); + + /* Check PFC for supported number of traffic classes */ + if (pfc->pfc_en & ~((1 << priv->dma_cap.tc_cnt) - 1)) { + netif_warn(priv, drv, netdev, + "PFC requested for unsupported traffic class\n"); + return -EINVAL; + } + + netif_dbg(priv, drv, netdev, + "cap=%hhuu, en=%#hhxx, mbc=%hhuu, delay=%hhuu\n", pfc->pfc_cap, + pfc->pfc_en, pfc->mbc, pfc->delay); + dn200_config_dcb_pfc(priv); + + netif_dbg(priv, drv, netdev, "(%s) set pfc\n", __func__); + return 0; +} + +static 
int dn200_dcb_ieee_setapp(struct net_device *netdev, struct dcb_app *app) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + struct dcb_ieee_app_dscp_map priority_map; + struct dcb_app temp; + bool is_new, need_add; + int err; + u8 up, dscp; + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) { + netif_dbg(priv, drv, netdev, "unsupported selector %d\n", + app->selector); + return -EOPNOTSUPP; + } + + if (app->protocol >= DN200_TRUST_DSCP || + app->priority >= DN200_TRUST_UP) { + netif_warn(priv, drv, netdev, + "invalid parameters protocol %d priority%d\n", + app->protocol, app->priority); + return -EINVAL; + } + + dcb_ieee_getapp_dscp_prio_mask_map(netdev, &priority_map); + if (priority_map.map[app->protocol]) { + is_new = false; + /** + * Skip the APP command if new and old mapping are the same, + * and replace the old APP entry if new mapping is different. 
+ */ + if (priority_map.map[app->protocol] & (1 << app->priority)) { + /* New and old mapping are the same, so do nothing */ + need_add = false; + netif_dbg(priv, drv, netdev, + "skip the dscp app command\n"); + } else { + /* Delete the old APP entry if exists */ + temp.selector = IEEE_8021QAZ_APP_SEL_DSCP; + temp.priority = + ffs(priority_map.map[app->protocol]) - 1; + temp.protocol = app->protocol; + err = dcb_ieee_delapp(netdev, &temp); + if (err) + return err; + + need_add = true; + netif_dbg(priv, drv, netdev, + "replace the old dscp app entry\n"); + } + } else { + /* No dscp mapping entry exists */ + is_new = true; + need_add = true; + netif_dbg(priv, drv, netdev, "add the new dscp app entry\n"); + } + + if (need_add) { + err = dcb_ieee_setapp(netdev, app); + if (err) { + netif_warn(priv, drv, netdev, + "fail to add the dscp app entry\n"); + return err; + } + } + + /* Update mapping in private data, default up 0 without dscp entry */ + dcb_ieee_getapp_dscp_prio_mask_map(netdev, &priority_map); + for (dscp = 0; dscp < DN200_TRUST_DSCP; dscp++) { + up = ffs(priority_map.map[dscp]); + if (up) + priv->dscp2up[dscp] = up - 1; + else + priv->dscp2up[dscp] = 0; + } + + if (is_new) { + priv->dscp_app_cnt++; + netif_dbg(priv, drv, netdev, "dev %d dscp app entry count %d\n", + priv->dev->ifindex, priv->dscp_app_cnt); + } + + return 0; +} + +static int dn200_dcb_ieee_delapp(struct net_device *netdev, struct dcb_app *app) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 support = dn200_dcb_getdcbx(netdev); + int err; + + if (PRIV_IS_VF(priv) || !(support & DCB_CAP_DCBX_VER_IEEE)) + return -EOPNOTSUPP; + + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) { + netif_warn(priv, drv, netdev, "unsupported selector %d\n", + app->selector); + return -EOPNOTSUPP; + } + + if (app->protocol >= DN200_TRUST_DSCP || + app->priority >= DN200_TRUST_UP) { + netif_warn(priv, drv, netdev, + "invalid parameters protocol %d priority%d\n", + app->protocol, app->priority); + return 
-EINVAL; + } + + /* Skip if no dscp app entry */ + if (!priv->dscp_app_cnt) + return -ENOENT; + + /* Delete the dscp app entry */ + err = dcb_ieee_delapp(netdev, app); + if (err) { + netif_warn(priv, drv, netdev, + "fail to delete the dscp app entry\n"); + return err; + } + + if (priv->dscp_app_cnt) + priv->dscp_app_cnt--; + netif_dbg(priv, drv, netdev, "dev %d dscp app entry count %d\n", + priv->dev->ifindex, priv->dscp_app_cnt); + + return 0; +} +static u8 dn200_dcb_getstate(struct net_device *netdev) +{ + /* DCB mode is on */ + return 1; +} + +static void dn200_dcb_getpermhwaddr(struct net_device *netdev, u8 *perm_addr) +{ + struct dn200_priv *priv = netdev_priv(netdev); + unsigned int hi_addr, lo_addr; + + if (!perm_addr) + return; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + hi_addr = readl(priv->ioaddr + XGMAC_ADDRX_HIGH(0)); + lo_addr = readl(priv->ioaddr + XGMAC_ADDRX_LOW(0)); + + /* Extract the MAC address from the high and low words */ + perm_addr[0] = lo_addr & 0xff; + perm_addr[1] = (lo_addr >> 8) & 0xff; + perm_addr[2] = (lo_addr >> 16) & 0xff; + perm_addr[3] = (lo_addr >> 24) & 0xff; + perm_addr[4] = hi_addr & 0xff; + perm_addr[5] = (hi_addr >> 8) & 0xff; +} + +static void dn200_dcb_getpgtccfgtx(struct net_device *netdev, + int priority, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + struct dn200_priv *priv = netdev_priv(netdev); + + if (!priv->ets) { + netdev_err(netdev, "%s, ets is not supported\n", __func__); + return; + } + + if (priority >= 8) { + netdev_err(netdev, "%s, priority is out of range\n", __func__); + return; + } + + *prio_type = 0; + *bw_pct = 0; + *up_map = 0; + *pgid = priv->prio2tc_bitmap[priority]; +} + +static void dn200_dcb_getpgbwgcfgtx(struct net_device *netdev, + int pgid, u8 *bw_pct) +{ + struct ieee_ets ets; + + if (pgid >= 8) { + netdev_err(netdev, "%s, priority group(TC) is out of range\n", + __func__); + return; + } + + dn200_dcb_ieee_getets(netdev, &ets); + *bw_pct = ets.tc_tx_bw[pgid]; +} + +static int 
dn200_dcb_get_priority_pfc(struct net_device *netdev, + int priority, u8 *setting) +{ + struct ieee_pfc pfc; + int err; + + err = dn200_dcb_ieee_getpfc(netdev, &pfc); + + if (err) + *setting = 0; + else + *setting = (pfc.pfc_en >> priority) & 0x01; + + return err; +} + +static void dn200_dcb_getpfccfg(struct net_device *netdev, + int priority, u8 *setting) +{ + if (priority >= 8) { + netdev_err(netdev, "%s, priority is out of range\n", __func__); + return; + } + + if (!setting) + return; + + dn200_dcb_get_priority_pfc(netdev, priority, setting); +} + +static u8 dn200_dcb_getcap(struct net_device *netdev, int capid, u8 *cap) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u8 rval = 0; + + switch (capid) { + case DCB_CAP_ATTR_PG: + *cap = true; + break; + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_UP2TC: + *cap = false; + break; + case DCB_CAP_ATTR_PG_TCS: + *cap = 1 << (priv->dma_cap.tc_cnt - 1); + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 1 << (priv->dma_cap.tc_cnt - 1); + break; + case DCB_CAP_ATTR_GSP: + *cap = false; + break; + case DCB_CAP_ATTR_BCN: + *cap = false; + break; + case DCB_CAP_ATTR_DCBX: + *cap = DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; + break; + default: + *cap = 0; + rval = 1; + break; + } + + return rval; +} + +static int dn200_dcb_getnumtcs(struct net_device *netdev, int tcs_id, u8 *num) +{ + struct dn200_priv *priv = netdev_priv(netdev); + + switch (tcs_id) { + case DCB_NUMTCS_ATTR_PG: + case DCB_NUMTCS_ATTR_PFC: + *num = priv->dma_cap.tc_cnt; + break; + default: + return -EINVAL; + } + + return 0; +} + +static u8 dn200_dcb_getpfcstate(struct net_device *netdev) +{ + struct ieee_pfc pfc; + + if (dn200_dcb_ieee_getpfc(netdev, &pfc)) + return 0; + + return pfc.pfc_en ? 
1 : 0; +} + +static const struct dcbnl_rtnl_ops dn200_dcbnl_ops = { + /* IEEE 802.1Qaz std */ + .ieee_getets = dn200_dcb_ieee_getets, + .ieee_setets = dn200_dcb_ieee_setets, + .ieee_getpfc = dn200_dcb_ieee_getpfc, + .ieee_setpfc = dn200_dcb_ieee_setpfc, + .ieee_setapp = dn200_dcb_ieee_setapp, + .ieee_delapp = dn200_dcb_ieee_delapp, + + /* DCBX configuration */ + .getdcbx = dn200_dcb_getdcbx, + .setdcbx = dn200_dcb_setdcbx, + + /* CEE Interfaces only for get_message */ + .getstate = dn200_dcb_getstate, + .getpermhwaddr = dn200_dcb_getpermhwaddr, + .getpgtccfgtx = dn200_dcb_getpgtccfgtx, + .getpgbwgcfgtx = dn200_dcb_getpgbwgcfgtx, + .getpfccfg = dn200_dcb_getpfccfg, + .getcap = dn200_dcb_getcap, + .getnumtcs = dn200_dcb_getnumtcs, + .getpfcstate = dn200_dcb_getpfcstate, +}; + +const struct dcbnl_rtnl_ops *dn200_get_dcbnl_ops(void) +{ + return &dn200_dcbnl_ops; +} + +void dn200_dcbnl_init(struct dn200_priv *priv, bool init) +{ + struct dcb_app temp; + struct dcb_ieee_app_dscp_map priority_map; + int i, j, err, up = 0; + + priv->dscp_app_cnt = 0; + + /* Init when dev open */ + if (init) { + netdev_dbg(priv->dev, + "Search all existing app dscp entries of dev %d\n", + priv->dev->ifindex); + /* Find the priority mapping to the DSCP */ + dcb_ieee_getapp_dscp_prio_mask_map(priv->dev, &priority_map); + for (i = 0; i < DN200_TRUST_DSCP; i++) { + /* Find the UP (ffs - 1) mapping to the DSCP */ + up = ffs(priority_map.map[i]); + if (up) { + netdev_dbg(priv->dev, + "Find the app dscp entry selector 5 protocol %d priority %d\n", + i, up - 1); + priv->dscp_app_cnt++; + } + } + netdev_dbg(priv->dev, "(%s) dev %d dscp app entry count %d\n", + __func__, priv->dev->ifindex, priv->dscp_app_cnt); + + /* Map TC to queue statically */ + dn200_config_queue_mapping(priv); + } else { + /* Delete when dn200_dvr_remove() */ + netdev_dbg(priv->dev, + "Clear all existing app dscp entries of dev %d\n", + priv->dev->ifindex); + /* Clear all existing app dscp entry */ + temp.selector = 
IEEE_8021QAZ_APP_SEL_DSCP; + for (i = 0; i < DN200_TRUST_DSCP; i++) { + priv->dscp2up[i] = 0; + temp.protocol = i; + for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) { + temp.priority = j; + err = dcb_ieee_delapp(priv->dev, &temp); + if (!err) { + netdev_dbg(priv->dev, + "Delete the app dscp entry selector %d protocol %d priority %d\n", + temp.selector, + temp.protocol, + temp.priority); + } + } + } + } +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_dcb.h b/drivers/net/ethernet/dapustor/dn200/dn200_dcb.h new file mode 100644 index 000000000000..824111df7266 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_dcb.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + */ + +#ifndef __DN200_DCB_H__ +#define __DN200_DCB_H__ + +#define DN200_TRUST_DSCP 64 +#define DN200_TRUST_UP 8 + +#include +#include "dn200.h" +void dn200_dcbnl_init(struct dn200_priv *priv, bool init); +const struct dcbnl_rtnl_ops *dn200_get_dcbnl_ops(void); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_eprom.c b/drivers/net/ethernet/dapustor/dn200/dn200_eprom.c new file mode 100644 index 000000000000..fc57cbecbc69 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_eprom.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ * + * Author: Wang Peixiang + * + * Update firmware for DN200 + */ + +#include +#include "dn200.h" +#include "dn200_cfg.h" +#include "dn200_eprom.h" + +static int dn200_nvme_update_fw_for_others(struct dn200_priv *priv, const char *fw, size_t fw_size) +{ + int ret; + u32 flag = 0; + + if (priv->plat_ex->vf_flag) { + dev_err(priv->device, "[loading fw]stop update fw, please wait vf's operation first.\n"); + return -EOPNOTSUPP; + } + + if (PRIV_IS_VF(priv)) { + dev_err(priv->device, "[loading fw]stop update fw, please wait vf's operation first.\n"); + return -EOPNOTSUPP; + } + + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, &flag); + if (flag) { + dev_warn(priv->device, "[loading fw]please wait update fw finish, then updage fw.\n"); + return -EOPNOTSUPP; + } + + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 1); + ret = dn200_nvme_fw_load(&priv->plat_ex->ctrl, fw, fw_size); + if (ret) { + dev_err(priv->device, "[loading fw]download fw to controller fail.\n"); + return ret; + } + + ret = dn200_nvme_fw_commit(&priv->plat_ex->ctrl); + + if (ret) + dev_err(priv->device, "[loading fw]dn200 load fw fail, sf %#x.\n", ret); + else + dev_info(priv->device, "[loading fw]dn200 load fw ok.\n"); + + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 0); + + return ret; + +} + +static int dn200_nvme_update_fw_for_raid(struct dn200_priv *priv, const char *fw, size_t fw_size) +{ + int ret; + unsigned long in_time_start; + u32 flag = 0; + + if (priv->plat_ex->vf_flag) { + dev_err(priv->device, "[loading fw]stop update fw, please wait vf's operation first.\n"); + return -EOPNOTSUPP; + } + + if (PRIV_IS_VF(priv)) { + dev_err(priv->device, "[loading fw]stop update fw, please wait vf's operation first.\n"); + return -EOPNOTSUPP; + } + + if (test_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state)) { + dev_warn(priv->device, "[loading fw]please wait update fw finish, then updage fw.\n"); + return -EOPNOTSUPP; + } + + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, 
upgrade_flag, &flag); + if (flag) { + dev_warn(priv->device, "[loading fw]please wait update fw finish, then updage fw.\n"); + return -EOPNOTSUPP; + } + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 1); + + priv->flag_upgrade = true; + priv->update_fail = false; + set_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + + ret = dn200_nvme_fw_load(&priv->plat_ex->ctrl, fw, fw_size); + if (ret) { + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 0); + dev_err(priv->device, "[loading fw]download fw to controller fail.\n"); + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + return ret; + } + + ret = dn200_nvme_fw_commit(&priv->plat_ex->ctrl); + if (ret) /*for hot upgrade, it must success*/ + dev_err(priv->device, "[loading fw]dn200 load fw fail, sf %#x.\n", ret); + else + dev_info(priv->device, "[loading fw]dn200 loading fw, please wait a moment when hot upgrade finish.\n"); + + mod_timer(&priv->upgrade_timer, jiffies + msecs_to_jiffies(500)); /*start timer*/ + in_time_start = jiffies; + while (true) { + if (!test_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state)) { + if (priv->update_fail) { + dev_err(priv->device, "fw upgrade fail\n"); + ret = -EIO; + } else { + dev_info(priv->device, "fw upgrade success\n"); + } + break; + } + if (time_after(jiffies, in_time_start + msecs_to_jiffies(40000))) { + dev_info(priv->device, "fw upgrade exceed time\n"); + ret = -EIO; + break; + } + usleep_range(100000, 200000); + } + priv->flag_upgrade = false; + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 0); + return ret; +} + +static int dn200_check_fw_invalid(struct dn200_priv *priv, const char *fw, size_t fw_size) +{ + struct header_file_t *head; + + if (fw_size < sizeof(struct header_file_t) || !fw) + return -EACCES; + + head = (struct header_file_t *)fw; + + if ((head->magic_num != HEADER_FILE_MAGIC || head->plat_id != 3) || + (priv->plat_ex->pdev->device == DN200_DEV_ID_SFP_10G_2P_SRIOV_PF && + !(head->board_id & (1 << 
board_type_xgmac_only))) || + (priv->plat_ex->pdev->device == DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF && + !(head->board_id & (1 << board_type_gmac_combo))) || + (priv->plat_ex->pdev->device == DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_PF && + !(head->board_id & (1 << board_type_xgmac_combo)))) { + dev_err(priv->device, "[loading fw]the firmware is invalid.\n"); + return -EINVAL; + } + + return 0; +} + +int dn200_load_firmware(struct net_device *netdev, struct ethtool_flash *efl) +{ + const struct firmware *fw; + int status; + struct dn200_priv *priv = netdev_priv(netdev); + + status = request_firmware(&fw, efl->data, priv->device); + if (status) { + dev_err(priv->device, "[loading fw]dn200 get fw fail.\n"); + goto fw_fail; + } + + dev_info(priv->device, "[loading fw]dn200 get fw:%s size 0x%lx.\n", + efl->data, fw->size); + + if (dn200_check_fw_invalid(priv, fw->data, fw->size)) { + status = -EINVAL; + goto fw_fail; + } + + if (priv->plat_ex->upgrade_with_flowing) { + status = dn200_nvme_update_fw_for_raid(priv, fw->data, fw->size); + if (status) + status = -EINVAL; + } else { + status = dn200_nvme_update_fw_for_others(priv, fw->data, fw->size); + if (status) + status = -EINVAL; + } + + +fw_fail: + release_firmware(fw); + return status; +} + +static int dn200_get_product_info_from_eprom(struct dn200_priv *priv, + struct product_info_t *info) +{ + int ret; + + if (!priv || !info) + return -EINVAL; + + ret = dn200_nvme_product_info_get(&priv->plat_ex->ctrl, info, sizeof(struct product_info_t)); + if (ret != 0) + return ret; + + if (info->magic_num != PRODUCT_MAGIC_NUMBER) { + dev_dbg(priv->device, "product info is invalid[%#x].\n", info->magic_num); + return -EINVAL; + } + + return ret; +} + +void dn200_get_mac_from_firmware(struct dn200_priv *priv, struct dn200_resources *res) +{ + struct product_info_t info; + int ret = dn200_get_product_info_from_eprom(priv, &info); + + if (!ret && priv->plat_ex->funcid < ARRAY_SIZE(info.mac_addr)) { + memcpy(res->mac, 
info.mac_addr[priv->plat_ex->funcid].addr, + ETH_ALEN); + return; + } + + get_random_bytes(&res->mac, ETH_ALEN); + /* Extract the MAC address from the high and low words */ + res->mac[0] = 0xd8; + res->mac[1] = 0xbc; + res->mac[2] = 0x59; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_eprom.h b/drivers/net/ethernet/dapustor/dn200/dn200_eprom.h new file mode 100644 index 000000000000..4a4a162df9af --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_eprom.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + * + * Author: Wang Peixiang + * + * Update firmware for DN200 + */ + +#define __DN200_EPROM_H__ + +struct mac_addr_t { + u8 addr[6]; +} __packed; + +union product_ver_t { + u64 all; + struct { + u64 ver:16; + u64 minor_ver:8; + u64 major_ver:8; + u64 prod_class:8; + u64 reserved:24; + } b; +}; + +struct product_info_t { + u32 magic_num; + struct mac_addr_t mac_addr[6]; + u8 sn[32]; + union product_ver_t prod_ver; + u8 flag[4]; + u8 mn[64]; + u8 mac_bitmap; + u8 pn[32]; +} __packed; + +#define PRODUCT_MAGIC_NUMBER (0x5A6C7D8E) + +#define FW_PACKAGE_FILE_NAME_SIZE 32 +#define HEADER_FILE_ITEM_COUNT 21 +#define HEADER_FILE_MAGIC 0xAC69CF5D + +struct header_file_entry_t { + u8 name[FW_PACKAGE_FILE_NAME_SIZE]; + u32 crc; + u32 len; + u32 real_len; + u8 bmp; +} __packed; + +struct header_file_t { + u32 magic_num; + u8 count; + u8 bmp; + u16 preloader_sz; + u32 loader_offset[2]; + u32 loader_sz; + u32 body_crc; + struct header_file_entry_t entry[HEADER_FILE_ITEM_COUNT]; + u8 rsv[30]; + u32 board_id; + u8 fwrev[16]; + u8 plat_id; + u32 crc; +} __packed; + +enum _board_type_e { + board_type_null, + board_type_boot_raid, + board_type_xgmac_combo, + board_type_xgmac_only, + board_type_gmac_combo, //gmac +combo*4 + board_type_fengshen1, + board_type_ram, // for ramdisk test +}; + +int dn200_load_firmware(struct net_device *netdev, struct ethtool_flash *efl); +void 
dn200_get_mac_from_firmware(struct dn200_priv *priv, struct dn200_resources *res); + diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_ethtool.c b/drivers/net/ethernet/dapustor/dn200/dn200_ethtool.c new file mode 100644 index 000000000000..3f4152d54018 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_ethtool.c @@ -0,0 +1,2170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include "dn200.h" +#include "dwxgmac_comm.h" +#include "dn200_phy.h" +#include "dn200_eprom.h" + +#define DN200_PRIV_FLAGS_PTP_RXTX BIT(0) +#define DN200_PRIV_FLAGS_FEC_EN BIT(1) +struct dn200_priv_flags { + char flag_string[ETH_GSTRING_LEN]; + u64 flag; + bool read_only; +}; + +#define DN200_PRIV_FLAG(_name, _flag, _read_only) { \ + .flag_string = _name, \ + .flag = _flag, \ + .read_only = _read_only, \ +} +static const struct dn200_priv_flags dn200_gstrings_priv_flags[] = { + DN200_PRIV_FLAG("ptp-rxtx", DN200_PRIV_FLAGS_PTP_RXTX, 0), + DN200_PRIV_FLAG("fec-en", DN200_PRIV_FLAGS_FEC_EN, 0), +}; + +#define DN200_PRIV_FLAGS_STR_LEN ARRAY_SIZE(dn200_gstrings_priv_flags) + +#define REG_SPACE_SIZE 0x1060 +#define GMAC4_REG_SPACE_SIZE 0x116C +#define GEN_PHY_REG_SPACE_SIZE 0x100 +#define MAC100_ETHTOOL_NAME "st_mac100" +#define GMAC_ETHTOOL_NAME "st_gmac" +#define XGMAC_ETHTOOL_NAME "dn200" + +/* Same as DMA_CHAN_BASE_ADDR defined in dwmac4_dma.h + * + * It is here because dwmac_dma.h and dwmac4_dam.h can not be included at the + * same time due to the conflicting macro names. 
+ */ +#define GMAC4_DMA_CHAN_BASE_ADDR 0x00001100 + +#define ETHTOOL_DMA_OFFSET 55 + +#define MMC_DISPLAY_FLAG 1 +struct dn200_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define DN200_STAT(name, m) \ + { name, sizeof_field(struct dn200_extra_stats, m), \ + offsetof(struct dn200_priv, xstats.m)} + +static const struct dn200_stats dn200_gstrings_stats[] = { + DN200_STAT("rx_csum_err", rx_csum_err), + /* Tx/Rx IRQ error info */ + DN200_STAT("tx_process_stopped_irq", tx_process_stopped_irq), + DN200_STAT("rx_buf_unav_irq", rx_buf_unav_irq), + DN200_STAT("fatal_bus_error_irq", fatal_bus_error_irq), + + /* Tx/Rx IRQ Events */ + DN200_STAT("tx_pkt_n", tx_pkt_n), + DN200_STAT("rx_pkt_n", rx_pkt_n), + DN200_STAT("normal_irq_n", normal_irq_n), + DN200_STAT("rx_normal_irq_n", rx_normal_irq_n), + DN200_STAT("napi_poll", napi_poll), + DN200_STAT("tx_normal_irq_n", tx_normal_irq_n), + DN200_STAT("tx_clean", tx_clean), + DN200_STAT("tx_set_ic_bit", tx_set_ic_bit), + DN200_STAT("irq_receive_pmt_irq_n", irq_receive_pmt_irq_n), + + /* EEE */ + DN200_STAT("irq_tx_path_in_lpi_mode_n", irq_tx_path_in_lpi_mode_n), + DN200_STAT("irq_tx_path_exit_lpi_mode_n", irq_tx_path_exit_lpi_mode_n), + DN200_STAT("irq_rx_path_in_lpi_mode_n", irq_rx_path_in_lpi_mode_n), + DN200_STAT("irq_rx_path_exit_lpi_mode_n", irq_rx_path_exit_lpi_mode_n), + DN200_STAT("phy_eee_wakeup_error_n", phy_eee_wakeup_error_n), + + /* TSO */ + DN200_STAT("tx_tso_frames", tx_tso_frames), + DN200_STAT("tx_tso_nfrags", tx_tso_nfrags), + + /* PF/VF reset stats */ + DN200_STAT("rst_start_count", rst_start_count), + DN200_STAT("rst_finish_count", rst_finish_count), + DN200_STAT("rst_start_ok_count", rst_start_ok_count), + DN200_STAT("rst_finish_ok_count", rst_finish_ok_count), + DN200_STAT("rst_start_accept_count", rst_start_accept_count), + DN200_STAT("rst_finish_accept_count", rst_finish_accept_count), + DN200_STAT("normal_rst_count", normal_rst_count), + 
DN200_STAT("tx_timeout_rst_count", tx_timeout_rst_count), + DN200_STAT("dma_chan_err_rst_count", dma_chan_err_rst_count), + + DN200_STAT("tx_frames_129_to_256", tx_frames_129_to_256), + DN200_STAT("tx_frames_65_to_128", tx_frames_65_to_128), + DN200_STAT("tx_frames_33_to_64", tx_frames_33_to_64), + DN200_STAT("tx_frames_17_to_32", tx_frames_17_to_32), + DN200_STAT("tx_frames_16_below", tx_frames_16_below), +}; + +#define DN200_STATS_LEN ARRAY_SIZE(dn200_gstrings_stats) + +/* HW MAC Management counters (if supported) */ +#define DN200_MMC_STAT(name, m) \ + { name, sizeof_field(struct dn200_counters, m), \ + offsetof(struct dn200_priv, mmc.m)} + +/* SW MAC Management counters (if supported) */ +#define DN200_SWC_STAT(name, m) \ + { name, sizeof_field(struct dn200_swcounters, m), \ + offsetof(struct dn200_priv, swc.m)} + +static const struct dn200_stats dn200_swc[] = { + DN200_SWC_STAT("port.rx_vlan_strip", mmc_rx_vlan_strip), + DN200_SWC_STAT("port.rx_fd_drop", mmc_rx_fd_drop), + DN200_SWC_STAT("port.tx_vlan_insert", mmc_tx_vlan_insert), + DN200_SWC_STAT("port.tx_mem_copy", tx_mem_copy), + DN200_SWC_STAT("port.rx_mem_copy", rx_mem_copy), + DN200_SWC_STAT("port.tx_iatu_hw_updt_cnt", tx_iatu_updt_cnt), + DN200_SWC_STAT("port.tx_iatu_match_cnt", tx_iatu_match_cnt), + DN200_SWC_STAT("port.tx_iatu_find_cnt", tx_iatu_find_cnt), + DN200_SWC_STAT("port.tx_iatu_hw_recyc_cnt", tx_iatu_recyc_cnt), + DN200_SWC_STAT("port.hw_lock_fail_cnt", hw_lock_fail_cnt), + DN200_SWC_STAT("port.hw_lock_timeout", hw_lock_timeout), + DN200_SWC_STAT("port.hw_lock_recfgs", hw_lock_recfgs), +}; + +static const struct dn200_stats dn200_mmc[] = { + DN200_MMC_STAT("port.tx_octetcount_gb", mmc_tx_octetcount_gb), + DN200_MMC_STAT("port.tx_framecount_gb", mmc_tx_framecount_gb), + DN200_MMC_STAT("port.tx_broadcastframe_g", mmc_tx_broadcastframe_g), + DN200_MMC_STAT("port.tx_multicastframe_g", mmc_tx_multicastframe_g), + DN200_MMC_STAT("port.tx_64_octets_gb", mmc_tx_64_octets_gb), + 
DN200_MMC_STAT("port.tx_65_to_127_octets_gb", + mmc_tx_65_to_127_octets_gb), + DN200_MMC_STAT("port.tx_128_to_255_octets_gb", + mmc_tx_128_to_255_octets_gb), + DN200_MMC_STAT("port.tx_256_to_511_octets_gb", + mmc_tx_256_to_511_octets_gb), + DN200_MMC_STAT("port.tx_512_to_1023_octets_gb", + mmc_tx_512_to_1023_octets_gb), + DN200_MMC_STAT("port.tx_1024_to_max_octets_gb", + mmc_tx_1024_to_max_octets_gb), + DN200_MMC_STAT("port.tx_unicast_gb", mmc_tx_unicast_gb), + DN200_MMC_STAT("port.tx_multicast_gb", mmc_tx_multicast_gb), + DN200_MMC_STAT("port.tx_broadcast_gb", mmc_tx_broadcast_gb), + DN200_MMC_STAT("port.tx_underflow_error", mmc_tx_underflow_error), + DN200_MMC_STAT("port.tx_octetcount_g", mmc_tx_octetcount_g), + DN200_MMC_STAT("port.tx_framecount_g", mmc_tx_framecount_g), + DN200_MMC_STAT("port.tx_pause_frame", mmc_tx_pause_frame), + DN200_MMC_STAT("port.tx_vlan_frame_g", mmc_tx_vlan_frame_g), + DN200_MMC_STAT("port.tx_lpi_usec", mmc_tx_lpi_usec), + DN200_MMC_STAT("port.tx_lpi_tran", mmc_tx_lpi_tran), + DN200_MMC_STAT("port.rx_framecount_gb", mmc_rx_framecount_gb), + DN200_MMC_STAT("port.rx_octetcount_gb", mmc_rx_octetcount_gb), + DN200_MMC_STAT("port.rx_octetcount_g", mmc_rx_octetcount_g), + DN200_MMC_STAT("port.rx_broadcastframe_g", mmc_rx_broadcastframe_g), + DN200_MMC_STAT("port.rx_multicastframe_g", mmc_rx_multicastframe_g), + DN200_MMC_STAT("port.rx_crc_error", mmc_rx_crc_error), + DN200_MMC_STAT("port.rx_run_error", mmc_rx_run_error), + DN200_MMC_STAT("port.rx_jabber_error", mmc_rx_jabber_error), + DN200_MMC_STAT("port.rx_undersize_g", mmc_rx_undersize_g), + DN200_MMC_STAT("port.rx_oversize_g", mmc_rx_oversize_g), + DN200_MMC_STAT("port.rx_64_octets_gb", mmc_rx_64_octets_gb), + DN200_MMC_STAT("port.rx_65_to_127_octets_gb", + mmc_rx_65_to_127_octets_gb), + DN200_MMC_STAT("port.rx_128_to_255_octets_gb", + mmc_rx_128_to_255_octets_gb), + DN200_MMC_STAT("port.rx_256_to_511_octets_gb", + mmc_rx_256_to_511_octets_gb), + 
DN200_MMC_STAT("port.rx_512_to_1023_octets_gb", + mmc_rx_512_to_1023_octets_gb), + DN200_MMC_STAT("port.rx_1024_to_max_octets_gb", + mmc_rx_1024_to_max_octets_gb), + DN200_MMC_STAT("port.rx_unicast_g", mmc_rx_unicast_g), + DN200_MMC_STAT("port.rx_length_error", mmc_rx_length_error), + DN200_MMC_STAT("port.rx_outofrangetype", mmc_rx_outofrangetype), + DN200_MMC_STAT("port.rx_pause_frames", mmc_rx_pause_frames), + DN200_MMC_STAT("port.rx_fifo_overflow", mmc_rx_fifo_overflow), + DN200_MMC_STAT("port.rx_vlan_frames_gb", mmc_rx_vlan_frames_gb), + DN200_MMC_STAT("port.rx_watchdog_error", mmc_rx_watchdog_error), + DN200_MMC_STAT("port.rx_lpi_usec", mmc_rx_lpi_usec), + DN200_MMC_STAT("port.rx_lpi_tran", mmc_rx_lpi_tran), + DN200_MMC_STAT("port.rx_discard_pkt_gb", mmc_rx_discard_pkt_gb), + DN200_MMC_STAT("port.rx_discard_oct_gb", mmc_rx_discard_oct_gb), + DN200_MMC_STAT("port.rx_align_err", mmc_rx_align_err), + DN200_MMC_STAT("port.rx_ipc_intr_mask", mmc_rx_ipc_intr_mask), + DN200_MMC_STAT("port.rx_ipc_intr", mmc_rx_ipc_intr), + DN200_MMC_STAT("port.rx_ipv4_gd", mmc_rx_ipv4_gd), + DN200_MMC_STAT("port.rx_ipv4_hderr", mmc_rx_ipv4_hderr), + DN200_MMC_STAT("port.rx_ipv4_nopay", mmc_rx_ipv4_nopay), + DN200_MMC_STAT("port.rx_ipv4_frag", mmc_rx_ipv4_frag), + DN200_MMC_STAT("port.rx_ipv4_udsbl", mmc_rx_ipv4_udsbl), + DN200_MMC_STAT("port.rx_ipv4_gd_octets", mmc_rx_ipv4_gd_octets), + DN200_MMC_STAT("port.rx_ipv4_hderr_octets", mmc_rx_ipv4_hderr_octets), + DN200_MMC_STAT("port.rx_ipv4_nopay_octets", mmc_rx_ipv4_nopay_octets), + DN200_MMC_STAT("port.rx_ipv4_frag_octets", mmc_rx_ipv4_frag_octets), + DN200_MMC_STAT("port.rx_ipv4_udsbl_octets", mmc_rx_ipv4_udsbl_octets), + DN200_MMC_STAT("port.rx_ipv6_gd_octets", mmc_rx_ipv6_gd_octets), + DN200_MMC_STAT("port.rx_ipv6_hderr_octets", mmc_rx_ipv6_hderr_octets), + DN200_MMC_STAT("port.rx_ipv6_nopay_octets", mmc_rx_ipv6_nopay_octets), + DN200_MMC_STAT("port.rx_ipv6_gd", mmc_rx_ipv6_gd), + DN200_MMC_STAT("port.rx_ipv6_hderr", 
mmc_rx_ipv6_hderr), + DN200_MMC_STAT("port.rx_ipv6_nopay", mmc_rx_ipv6_nopay), + DN200_MMC_STAT("port.rx_udp_gd", mmc_rx_udp_gd), + DN200_MMC_STAT("port.rx_udp_err", mmc_rx_udp_err), + DN200_MMC_STAT("port.rx_udp_gd_octets", mmc_rx_udp_gd_octets), + DN200_MMC_STAT("port.rx_udp_err_octets", mmc_rx_udp_err_octets), + DN200_MMC_STAT("port.rx_tcp_gd", mmc_rx_tcp_gd), + DN200_MMC_STAT("port.rx_tcp_err", mmc_rx_tcp_err), + DN200_MMC_STAT("port.rx_tcp_gd_octets", mmc_rx_tcp_gd_octets), + DN200_MMC_STAT("port.rx_tcp_err_octets", mmc_rx_tcp_err_octets), + DN200_MMC_STAT("port.rx_icmp_gd", mmc_rx_icmp_gd), + DN200_MMC_STAT("port.rx_icmp_err", mmc_rx_icmp_err), + DN200_MMC_STAT("port.rx_icmp_gd_octets", mmc_rx_icmp_gd_octets), + DN200_MMC_STAT("port.rx_icmp_err_octets", mmc_rx_icmp_err_octets), + DN200_MMC_STAT("port.tx_fpe_fragment_cntr", mmc_tx_fpe_fragment_cntr), + DN200_MMC_STAT("port.tx_hold_req_cntr", mmc_tx_hold_req_cntr), + DN200_MMC_STAT("port.rx_packet_assembly_err_cntr", + mmc_rx_packet_assembly_err_cntr), + DN200_MMC_STAT("port.rx_packet_assembly_ok_cntr", + mmc_rx_packet_assembly_ok_cntr), + DN200_MMC_STAT("port.rx_fpe_fragment_cntr", mmc_rx_fpe_fragment_cntr), +}; + +#define DN200_MMC_STATS_LEN ARRAY_SIZE(dn200_mmc) +#define DN200_SWC_STATS_LEN ARRAY_SIZE(dn200_swc) + +static const char dn200_qstats_flag_string[][ETH_GSTRING_LEN] = { + "mmc_all_function", +#define DN200_FLAG_STATS ARRAY_SIZE(dn200_qstats_flag_string) +}; + +static const char dn200_qstats_tx_string[][ETH_GSTRING_LEN] = { + "tx_pkt_n", +#define DN200_TXQ_STATS ARRAY_SIZE(dn200_qstats_tx_string) +}; + +static const char dn200_qstats_rx_string[][ETH_GSTRING_LEN] = { + "rx_pkt_n", +#define DN200_RXQ_STATS ARRAY_SIZE(dn200_qstats_rx_string) +}; + +static void dn200_ethtool_getdrvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret; + char str_ver[33] = ""; + struct dn200_ver dn200_ver; + + if (priv->plat->has_gmac || 
priv->plat->has_gmac4) + strscpy(info->driver, GMAC_ETHTOOL_NAME, sizeof(info->driver)); + else if (priv->plat->has_xgmac) + strscpy(info->driver, XGMAC_ETHTOOL_NAME, sizeof(info->driver)); + else + strscpy(info->driver, MAC100_ETHTOOL_NAME, + sizeof(info->driver)); + + if (priv->plat_ex->pdev) { + strscpy(info->bus_info, pci_name(priv->plat_ex->pdev), + sizeof(info->bus_info)); + } + strscpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); + + if (!priv->plat_ex->nvme_supported) { + ret = dn200_ctrl_ccena(priv->plat_ex->pdev, 0, 1, false); + if (ret) { + dev_err(&priv->plat_ex->pdev->dev, + "func %s, line %d: ctrl cc enable timeout\n", + __func__, __LINE__); + } + } + if (PRIV_IS_VF(priv)) { + dn200_sriov_ver_get(priv, &dn200_ver); + } else { + dn200_get_fw_ver(&priv->plat_ex->ctrl, &dn200_ver); + if (PRIV_SRIOV_SUPPORT(priv)) + dn200_sriov_ver_set(priv, &dn200_ver); + } + + sprintf(str_ver, "%c%c%c%c%c%c%c%c", dn200_ver.type, dn200_ver.product_type, + dn200_ver.rsv, dn200_ver.is_fw, dn200_ver.publish, dn200_ver.number0, + dn200_ver.number1, dn200_ver.number2); + strscpy(info->fw_version, str_ver, sizeof(info->fw_version)); +} + +static int dn200_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct dn200_priv *priv = netdev_priv(dev); + + return PRIV_PHY_OPS(priv)->get_link_ksettings(dev, cmd); +} + +static int +dn200_ethtool_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct dn200_priv *priv = netdev_priv(dev); + + if (HW_IS_VF(priv->hw)) + return -EOPNOTSUPP; + return PRIV_PHY_OPS(priv)->set_link_ksettings(dev, cmd); +} + +static u32 dn200_ethtool_getmsglevel(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + + return priv->msg_enable; +} + +static void dn200_ethtool_setmsglevel(struct net_device *dev, u32 level) +{ + struct dn200_priv *priv = netdev_priv(dev); + + priv->msg_enable = level; +} + +static int 
dn200_ethtool_get_regs_len(struct net_device *dev)
{
	int len;
	struct dn200_priv *priv = netdev_priv(dev);

	/* Register dump size: MAC/DMA space plus, when an MDIO bus is
	 * present, a 16-byte-aligned PHY register area. */
	if (priv->plat->has_xgmac)
		len = XGMAC_REGSIZE * 4;
	else
		len = REG_SPACE_SIZE;

	if (priv->mii) {
		len = ALIGN(len, 16);
		len += GEN_PHY_REG_SPACE_SIZE;
	}

	return len;
}

/* Append the PHY register dump (32 standard MDIO registers followed by
 * a table of vendor extended registers, each tagged with its address in
 * the upper 16 bits) at the tail of the regs buffer. */
static void dn200_dump_phy_regs(struct dn200_priv *priv, u32 *reg_space)
{
	int offset;
	int len;
	int i = 0;
	int addr = priv->plat->phy_addr;
	struct phy_device *phydev;
	u32 ext_reg[] = {0xa0, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
			 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1,
			 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xa003};

	if (!priv->mii)
		return;

	len = dn200_ethtool_get_regs_len(priv->dev);

	/* 32 standard registers at the start of the PHY area. */
	for (offset = len - GEN_PHY_REG_SPACE_SIZE; offset < len - GEN_PHY_REG_SPACE_SIZE + 32 * 4;
	     offset += 4, i++) {
		reg_space[offset >> 2] =
			mdiobus_read(priv->mii, priv->plat->phy_addr, i);
	}

	phydev = mdiobus_get_phy(priv->mii, addr);
	if (!phydev)
		return;

	/* Vendor extended registers follow the standard ones. */
	offset = len - GEN_PHY_REG_SPACE_SIZE + 32 * 4;
	for (i = 0; i < ARRAY_SIZE(ext_reg); i++) {
		reg_space[offset >> 2] =
			ytphy_read_ext(phydev, ext_reg[i]) | (ext_reg[i] << 16);
		offset += 4;
	}
}

/* ethtool .get_regs: dump MAC, DMA and (optionally) PHY registers. */
static void dn200_ethtool_gregs(struct net_device *dev,
				struct ethtool_regs *regs, void *space)
{
	struct dn200_priv *priv = netdev_priv(dev);
	u32 *reg_space = (u32 *) space;

	dn200_dump_mac_regs(priv, priv->hw, reg_space);
	dn200_dump_dma_regs(priv, priv->ioaddr, reg_space);
	dn200_dump_phy_regs(priv, reg_space);
}

/* ethtool .nway_reset: restart autonegotiation. With an MDIO-attached
 * PHY this is implemented as a full close/open cycle guarded by the
 * DN200_RESETING bit; otherwise delegate to the PHY backend. */
static int dn200_nway_reset(struct net_device *dev)
{
	int retry = 0;
	struct dn200_priv *priv = netdev_priv(dev);

	if (!netif_running(dev))
		return -EBUSY;

	if (HW_IS_VF(priv->hw))
		return -EOPNOTSUPP;

	if (priv->mii) {
		/* Serialize against other reset paths; give up after
		 * ~3-6 ms instead of blocking forever. */
		while (test_and_set_bit(DN200_RESETING, &priv->state)) {
			usleep_range(1000, 2000);
			if (retry++ >= 3)
				return -EBUSY;
		}

		if (netif_running(dev)) {
			dev_close(dev);

			dev_open(dev, NULL);

			/* Re-apply RX filtering lost across the reopen. */
			dev->netdev_ops->ndo_set_rx_mode(dev);
		}
		clear_bit(DN200_RESETING, &priv->state);
		return 0;
	} else
		return PRIV_PHY_OPS(priv)->nway_reset(PRIV_PHY_INFO(priv));
}

/* ethtool .get_ringparam */
static void
dn200_get_ringparam(struct net_device *netdev,
		    struct ethtool_ringparam *ring,
		    struct kernel_ethtool_ringparam __always_unused *ker,
		    struct netlink_ext_ack __always_unused *extack)
{
	struct dn200_priv *priv = netdev_priv(netdev);

	ring->rx_max_pending = DMA_MAX_RX_SIZE;
	ring->tx_max_pending = DMA_MAX_TX_SIZE;
	ring->rx_pending = priv->dma_rx_size;
	ring->tx_pending = priv->dma_tx_size;
}

/* ethtool .set_ringparam: validate bounds, then rebuild the rings. */
static int
dn200_set_ringparam(struct net_device *netdev,
		    struct ethtool_ringparam *ring,
		    struct kernel_ethtool_ringparam __always_unused *ker,
		    struct netlink_ext_ack __always_unused *extack)
{
	if (ring->rx_mini_pending || ring->rx_jumbo_pending ||
	    ring->rx_pending < DMA_MIN_RX_SIZE ||
	    ring->rx_pending > DMA_MAX_RX_SIZE ||
	    ring->tx_pending < DMA_MIN_TX_SIZE ||
	    ring->tx_pending > DMA_MAX_TX_SIZE)
		return -EINVAL;
	return dn200_reinit_ringparam(netdev, ring->rx_pending,
				      ring->tx_pending);
}

/* ethtool .get_pauseparam: delegated to the PHY backend. */
static void
dn200_get_pauseparam(struct net_device *netdev,
		     struct ethtool_pauseparam *pause)
{
	struct dn200_priv *priv = netdev_priv(netdev);

	PRIV_PHY_OPS(priv)->get_phy_pauseparam(PRIV_PHY_INFO(priv), pause);
}

/* ethtool .set_pauseparam: refused for VFs and while PFC is active. */
static int
dn200_set_pauseparam(struct net_device *netdev,
		     struct ethtool_pauseparam *pause)
{
	struct dn200_priv *priv = netdev_priv(netdev);
	struct ieee_pfc *pfc = priv->pfc;

	if (PRIV_IS_VF(priv))
		return -EOPNOTSUPP;

	/* PFC shares the pause HW enable bit, so the two are exclusive. */
	if (pfc && pfc->pfc_en) {
		netdev_info(netdev,
			    "Priority flow control is enabled. 
Cannot set link flow control.\n"); + return -EOPNOTSUPP; + } + + return PRIV_PHY_OPS(priv)->set_phy_pauseparam(PRIV_PHY_INFO(priv), + pause); +} + +static void dn200_get_per_qstats(struct dn200_priv *priv, u64 *data, + int *count) +{ + u32 tx_cnt = priv->plat->tx_queues_to_use; + u32 rx_cnt = priv->plat->rx_queues_to_use; + int q, stat; + char *p; + + for (q = 0; q < tx_cnt; q++) { + p = (char *)priv + offsetof(struct dn200_priv, + xstats.txq_stats[q].tx_pkt_n); + for (stat = 0; stat < DN200_TXQ_STATS; stat++) { + *data++ = (*(u64 *) p); + p += sizeof(u64 *); + *count = *count + 1; + } + } + for (q = 0; q < rx_cnt; q++) { + p = (char *)priv + offsetof(struct dn200_priv, + xstats.rxq_stats[q].rx_pkt_n); + for (stat = 0; stat < DN200_RXQ_STATS; stat++) { + *data++ = (*(u64 *) p); + p += sizeof(u64 *); + *count = *count + 1; + } + } +} + +static void dn200_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *dummy, u64 *data) +{ + struct dn200_priv *priv = netdev_priv(dev); + int count = 0; + int i, j = 0; + char *p; + + for (i = 0; i < DN200_STATS_LEN; i++) { + char *p = (char *)priv + dn200_gstrings_stats[i].stat_offset; + + data[j++] = (dn200_gstrings_stats[i].sizeof_stat == + sizeof(u64)) ? (*(u64 *) p) : (*(u32 *) p); + } + dn200_get_per_qstats(priv, &data[j], &count); + j = j + count; + if (PRIV_SRIOV_SUPPORT(priv)) + data[j++] = MMC_DISPLAY_FLAG; + + if (priv->dma_cap.rmon) { + if (!test_bit(DN200_SUSPENDED, &priv->state) && !test_bit(DN200_DOWN, &priv->state)) + dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc); + + for (i = 0; i < DN200_MMC_STATS_LEN; i++) { + p = (char *)priv + dn200_mmc[i].stat_offset; + + data[j++] = (dn200_mmc[i].sizeof_stat == + sizeof(u64)) ? (*(u64 *) p) : (*(u32 *) p); + } + } + for (i = 0; i < DN200_SWC_STATS_LEN; i++) { + p = (char *)priv + dn200_swc[i].stat_offset; + + data[j++] = (dn200_swc[i].sizeof_stat == + sizeof(u64)) ? 
(*(u64 *) p) : (*(u32 *) p); + } +} + +static int dn200_get_sset_count(struct net_device *netdev, int sset) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u32 tx_cnt = priv->plat->tx_queues_to_use; + u32 rx_cnt = priv->plat->rx_queues_to_use; + int len; + + switch (sset) { + case ETH_SS_STATS: + len = DN200_STATS_LEN + + DN200_TXQ_STATS * tx_cnt + DN200_RXQ_STATS * rx_cnt; + if (PRIV_SRIOV_SUPPORT(priv)) + len += DN200_FLAG_STATS; + if (priv->dma_cap.rmon) + len += DN200_MMC_STATS_LEN; + len += DN200_SWC_STATS_LEN; + return len; + case ETH_SS_TEST: + if (PRIV_IS_VF(priv)) + return -EOPNOTSUPP; + return dn200_selftest_get_count(priv); + case ETH_SS_PRIV_FLAGS: + if (PRIV_IS_VF(priv)) + return -EOPNOTSUPP; + return DN200_PRIV_FLAGS_STR_LEN; + default: + return -EOPNOTSUPP; + } +} + +static void dn200_get_qstats_string(struct dn200_priv *priv, u8 *data, + int *count) +{ + u32 tx_cnt = priv->plat->tx_queues_to_use; + u32 rx_cnt = priv->plat->rx_queues_to_use; + int q, stat; + + for (q = 0; q < tx_cnt; q++) { + for (stat = 0; stat < DN200_TXQ_STATS; stat++) { + snprintf(data, ETH_GSTRING_LEN, "q%d_%s", q, + dn200_qstats_tx_string[stat]); + data += ETH_GSTRING_LEN; + *count += ETH_GSTRING_LEN; + } + } + for (q = 0; q < rx_cnt; q++) { + for (stat = 0; stat < DN200_RXQ_STATS; stat++) { + snprintf(data, ETH_GSTRING_LEN, "q%d_%s", q, + dn200_qstats_rx_string[stat]); + data += ETH_GSTRING_LEN; + *count += ETH_GSTRING_LEN; + } + } +} + +static void dn200_get_priv_flag_strings(struct dn200_priv *priv, u8 *data) +{ + + char *p = (char *)data; + unsigned int i; + + if (PRIV_IS_VF(priv)) + return; + for (i = 0; i < DN200_PRIV_FLAGS_STR_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + dn200_gstrings_priv_flags[i].flag_string); + p += ETH_GSTRING_LEN; + } +} + +static void dn200_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + int i; + u8 *p = data; + int count = 0; + struct dn200_priv *priv = netdev_priv(dev); + + switch (stringset) { + case ETH_SS_STATS: 
		for (i = 0; i < DN200_STATS_LEN; i++) {
			memcpy(p, dn200_gstrings_stats[i].stat_string,
			       ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}
		dn200_get_qstats_string(priv, p, &count);
		p += count;
		if (PRIV_SRIOV_SUPPORT(priv)) {
			memcpy(p, dn200_qstats_flag_string, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}
		if (priv->dma_cap.rmon) {
			for (i = 0; i < DN200_MMC_STATS_LEN; i++) {
				memcpy(p, dn200_mmc[i].stat_string,
				       ETH_GSTRING_LEN);
				p += ETH_GSTRING_LEN;
			}
		}
		for (i = 0; i < DN200_SWC_STATS_LEN; i++) {
			memcpy(p, dn200_swc[i].stat_string, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}
		break;
	case ETH_SS_TEST:
		if (!PRIV_IS_VF(priv))
			dn200_selftest_get_strings(priv, p);
		break;
	case ETH_SS_PRIV_FLAGS:
		dn200_get_priv_flag_strings(priv, p);
		break;
	default:
		WARN_ON(1);
		break;
	}
}

/* ethtool .get_eee: report SW EEE state, then let the PHY backend
 * refine it. NOTE(review): this getter also writes priv->tx_lpi_enabled
 * for MDIO PHYs — side effect in a get path, confirm intended. */
static int dn200_ethtool_op_get_eee(struct net_device *dev,
				    struct ethtool_eee *edata)
{
	struct dn200_priv *priv = netdev_priv(dev);
	int ret;

	if (!priv->dma_cap.eee)
		return -EOPNOTSUPP;

	edata->eee_enabled = priv->eee_enabled;
	edata->eee_active = priv->eee_active;
	edata->tx_lpi_timer = priv->tx_lpi_timer;
	edata->tx_lpi_enabled = priv->tx_lpi_enabled;

	ret = PRIV_PHY_OPS(priv)->get_eee(PRIV_PHY_INFO(priv), edata);
	if (priv->mii) {
		/* LPI timer is not configurable on MDIO PHYs; tx-lpi
		 * tracks the overall EEE enable. */
		edata->tx_lpi_timer = 0;
		priv->tx_lpi_enabled = edata->eee_enabled;
		edata->tx_lpi_enabled = priv->tx_lpi_enabled;
	}

	return ret;
}

/* ethtool .set_eee: reject unsupported tx-lpi/timer changes, push the
 * new state to the PHY backend, then re-init EEE if the timer moved. */
static int dn200_ethtool_op_set_eee(struct net_device *dev,
				    struct ethtool_eee *edata)
{
	struct dn200_priv *priv = netdev_priv(dev);
	int ret;

	if (!priv->dma_cap.eee)
		return -EOPNOTSUPP;

	/* Without an MDIO PHY the link must be down to reconfigure. */
	if (!priv->mii && netif_running(dev))
		return -EBUSY;
	if (HW_IS_VF(priv->hw))
		return -EOPNOTSUPP;

	if (priv->mii) {
		if (priv->tx_lpi_enabled != edata->tx_lpi_enabled) {
			netdev_err(dev, "Setting EEE tx-lpi is not supported\n");
			return -EINVAL;
		}
		if (edata->tx_lpi_timer) {
			netdev_err(dev, "Setting EEE Tx LPI timer is not supported\n");
			return -EINVAL;
		}
	}

	if (priv->tx_lpi_enabled != edata->tx_lpi_enabled)
		netdev_warn(priv->dev, "Setting EEE tx-lpi is not supported\n");

	if (!edata->eee_enabled)
		dn200_disable_eee_mode(priv);

	ret = PRIV_PHY_OPS(priv)->set_eee(PRIV_PHY_INFO(priv), edata);
	if (ret)
		return ret;

	if (edata->eee_enabled && priv->tx_lpi_timer != edata->tx_lpi_timer) {
		priv->tx_lpi_timer = edata->tx_lpi_timer;
		dn200_eee_init(priv);
	}

	return 0;
}

/* Common coalesce getter; queue < 0 means "queue 0 / all queues". */
static int __dn200_get_coalesce(struct net_device *dev,
				struct ethtool_coalesce *ec, int queue)
{
	struct dn200_priv *priv = netdev_priv(dev);
	u32 max_cnt;
	u32 rx_cnt;
	u32 tx_cnt;

	rx_cnt = priv->plat->rx_queues_to_use;
	tx_cnt = priv->plat->tx_queues_to_use;
	max_cnt = max(rx_cnt, tx_cnt);

	if (queue < 0)
		queue = 0;
	else if (queue >= max_cnt)
		return -EINVAL;

	if (queue < tx_cnt) {
		ec->tx_coalesce_usecs = priv->tx_coal_timer[queue];
		ec->tx_max_coalesced_frames = priv->tx_coal_frames_set[queue];
		if (!!
		    (priv->tx_intr[queue].itr_setting & DN200_ITR_DYNAMIC_ITR))
			ec->use_adaptive_tx_coalesce = 1;
	} else {
		ec->tx_coalesce_usecs = 0;
		ec->tx_max_coalesced_frames = 0;
	}

	if (priv->use_riwt && queue < rx_cnt) {
		ec->rx_max_coalesced_frames = priv->rx_coal_frames[queue];
		ec->rx_coalesce_usecs = priv->rx_rius[queue];
		if (!!
+ (priv->rx_intr[queue].itr_setting & DN200_ITR_DYNAMIC_ITR)) + ec->use_adaptive_rx_coalesce = 1; + } else { + ec->rx_max_coalesced_frames = 0; + ec->rx_coalesce_usecs = 0; + } + + return 0; +} + +static int dn200_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce __maybe_unused *kec, + struct netlink_ext_ack __maybe_unused *extack) +{ + return __dn200_get_coalesce(dev, ec, -1); +} + +static int dn200_get_per_queue_coalesce(struct net_device *dev, u32 queue, + struct ethtool_coalesce *ec) +{ + return __dn200_get_coalesce(dev, ec, queue); +} + +static int __dn200_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *ec, int queue) +{ + struct dn200_priv *priv = netdev_priv(dev); + bool all_queues = false; + unsigned int rx_riwt; + u32 max_cnt; + u32 rx_cnt; + u32 tx_cnt; + u32 tx_frame; + int i; + + rx_cnt = priv->plat->rx_queues_to_use; + tx_cnt = priv->plat->tx_queues_to_use; + max_cnt = max(rx_cnt, tx_cnt); + + if (queue < 0) + all_queues = true; + else if (queue >= max_cnt) + return -EINVAL; + + if (ec->rx_max_coalesced_frames >= + (priv->dma_rx_size - dn200_rx_refill_size(priv))) { + netdev_err(dev, + "rx frams plus 32 need to be less than ring_size (now %d)\n", + priv->dma_rx_size); + return -EINVAL; + } + if (ec->tx_max_coalesced_frames > (priv->dma_tx_size >> 1)) { + netdev_err(dev, + "TX frams need to be less than ring_size/2 (now ring_size is %d)\n", + priv->dma_rx_size); + return -EINVAL; + } + + if ((ec->rx_coalesce_usecs != priv->rx_rius[queue < 0 ? 0 : queue] || + ec->rx_max_coalesced_frames != priv->rx_coal_frames[queue < 0 ? 
							       0 : queue]) &&
	    ec->use_adaptive_rx_coalesce) {
		netdev_err(dev,
			   "RX interrupt moderation cannot be changed if adaptive-rx is enabled.\n");
		return -EINVAL;
	}

	if (ec->rx_coalesce_usecs > DN200_MAX_COAL_RX_TICK)
		return -EINVAL;

	/* Convert usecs to RIWT ticks and clamp to the HW range. */
	rx_riwt = dn200_usec2riwt(ec->rx_coalesce_usecs, priv);
	if (rx_riwt > MAX_DMA_RIWT)
		rx_riwt = MAX_DMA_RIWT;

	if (rx_riwt < MIN_DMA_RIWT)
		rx_riwt = MIN_DMA_RIWT;

	if (priv->use_riwt && ec->use_adaptive_rx_coalesce) {
		/* Adaptive RX: program the watchdog and mark the queue(s)
		 * as dynamically moderated. */
		if (all_queues) {
			for (i = 0; i < rx_cnt; i++) {
				priv->rx_riwt[i] = rx_riwt;
				priv->rx_rius[i] = ec->rx_coalesce_usecs;
				priv->rx_intr[i].target_itr =
				    ec->rx_coalesce_usecs;
				dn200_rx_watchdog(priv, priv->ioaddr, rx_riwt,
						  i, priv->hw);
				priv->rx_intr[i].itr_setting |=
				    DN200_ITR_DYNAMIC_ITR;
				priv->rx_coal_frames[i] =
				    ec->rx_max_coalesced_frames;
			}
		} else if (queue < rx_cnt) {
			priv->rx_riwt[queue] = rx_riwt;
			priv->rx_rius[queue] = ec->rx_coalesce_usecs;
			priv->rx_intr[queue].target_itr = ec->rx_coalesce_usecs;
			priv->rx_coal_frames[queue] =
			    ec->rx_max_coalesced_frames;
			priv->rx_intr[queue].itr_setting |=
			    DN200_ITR_DYNAMIC_ITR;
			dn200_rx_watchdog(priv, priv->ioaddr, rx_riwt, queue,
					  priv->hw);

		}
	} else if (priv->use_riwt) {
		/* Fixed RX moderation: clear the dynamic flag and use the
		 * clamped RIWT value converted back to usecs. */
		if (all_queues) {
			for (i = 0; i < rx_cnt; i++) {
				priv->rx_riwt[i] = rx_riwt;
				priv->rx_rius[i] = ec->rx_coalesce_usecs;
				priv->rx_intr[i].itr_setting &=
				    ~DN200_ITR_DYNAMIC_ITR;
				priv->rx_intr[i].target_itr =
				    dn200_riwt2usec(rx_riwt, priv);
				/* keep target_itr >= 2 so the usec->riwt
				 * round trip cannot collapse to 0 */
				if (priv->rx_intr[i].target_itr < 2)
					priv->rx_intr[i].target_itr = 2;
				dn200_rx_watchdog(priv, priv->ioaddr, rx_riwt,
						  i, priv->hw);
				priv->rx_coal_frames[i] =
				    ec->rx_max_coalesced_frames;
			}
		} else if (queue < rx_cnt) {
			priv->rx_riwt[queue] = rx_riwt;
			priv->rx_rius[queue] = ec->rx_coalesce_usecs;
			priv->rx_intr[queue].itr_setting &=
			    ~DN200_ITR_DYNAMIC_ITR;
			priv->rx_intr[queue].target_itr =
			    dn200_riwt2usec(rx_riwt,
					    priv);
			/* keep target_itr >= 2 so the usec->riwt round
			 * trip cannot collapse to 0 */
			if (priv->rx_intr[queue].target_itr < 2)
				priv->rx_intr[queue].target_itr = 2;
			dn200_rx_watchdog(priv, priv->ioaddr,
					  rx_riwt, queue, priv->hw);
			priv->rx_coal_frames[queue] =
			    ec->rx_max_coalesced_frames;
		}
	}

	if ((ec->tx_coalesce_usecs != priv->tx_coal_timer[queue < 0 ? 0 : queue] ||
	     ec->tx_max_coalesced_frames != priv->tx_coal_frames_set[queue < 0 ? 0 : queue]) &&
	    ec->use_adaptive_tx_coalesce) {
		netdev_err(dev,
			   "TX interrupt moderation cannot be changed if adaptive-tx is enabled.\n");
		return -EINVAL;
	}

	if (ec->tx_coalesce_usecs > DN200_MAX_COAL_TX_TICK)
		return -EINVAL;

	if (ec->use_adaptive_tx_coalesce) {
		/* Adaptive TX: record the requested frames/timer and mark
		 * the queue(s) as dynamically moderated. */
		if (all_queues) {
			int i;

			for (i = 0; i < tx_cnt; i++) {
				priv->tx_coal_frames_set[i] =
				    ec->tx_max_coalesced_frames;
				priv->tx_coal_frames[i] =
				    ec->tx_max_coalesced_frames;
				priv->tx_coal_timer[i] = ec->tx_coalesce_usecs;
				priv->tx_intr[i].itr_setting |=
				    DN200_ITR_DYNAMIC_ITR;
				priv->tx_intr[i].target_itr =
				    ec->tx_max_coalesced_frames;
			}
		} else if (queue < tx_cnt) {
			priv->tx_coal_frames_set[queue] =
			    ec->tx_max_coalesced_frames;
			priv->tx_coal_frames[queue] =
			    ec->tx_max_coalesced_frames;
			priv->tx_coal_timer[queue] = ec->tx_coalesce_usecs;
			priv->tx_intr[queue].itr_setting |=
			    DN200_ITR_DYNAMIC_ITR;
			priv->tx_intr[queue].target_itr =
			    ec->tx_max_coalesced_frames;
		}
	} else {
		/* Fixed TX moderation: clamp frames to [1, MAX]. */
		if (ec->tx_max_coalesced_frames > DN200_TX_MAX_FRAMES)
			tx_frame = DN200_TX_MAX_FRAMES;
		else if (ec->tx_max_coalesced_frames <= 0)
			tx_frame = 1;
		else
			tx_frame = ec->tx_max_coalesced_frames;

		if (all_queues) {
			int i;

			for (i = 0; i < tx_cnt; i++) {
				priv->tx_coal_frames[i] = tx_frame;
				priv->tx_coal_frames_set[i] =
				    ec->tx_max_coalesced_frames;
				priv->tx_coal_timer[i] = ec->tx_coalesce_usecs;
				priv->tx_intr[i].itr_setting &=
				    ~DN200_ITR_DYNAMIC_ITR;
				priv->tx_intr[i].target_itr = tx_frame;

			}
		} else if (queue < tx_cnt) {
			priv->tx_coal_frames_set[queue] =
			    ec->tx_max_coalesced_frames;
			priv->tx_coal_frames[queue] = tx_frame;
			priv->tx_coal_timer[queue] = ec->tx_coalesce_usecs;
			priv->tx_intr[queue].itr_setting &=
			    ~DN200_ITR_DYNAMIC_ITR;
			priv->tx_intr[queue].target_itr = tx_frame;
		}
	}

	return 0;
}

/* ethtool .set_coalesce: apply to all queues. */
static int dn200_set_coalesce(struct net_device *dev,
			      struct ethtool_coalesce *ec,
			      struct kernel_ethtool_coalesce __maybe_unused *kec,
			      struct netlink_ext_ack __maybe_unused *extack)
{
	return __dn200_set_coalesce(dev, ec, -1);
}

/* ethtool .set_per_queue_coalesce: only while the interface is down. */
static int dn200_set_per_queue_coalesce(struct net_device *dev, u32 queue,
					struct ethtool_coalesce *ec)
{
	if (netif_running(dev))
		return -EBUSY;

	return __dn200_set_coalesce(dev, ec, queue);
}

/* ethtool .get_rxfh_key_size */
static u32 dn200_get_rxfh_key_size(struct net_device *dev)
{
	struct dn200_priv *priv = netdev_priv(dev);

	return sizeof(priv->rss.key);
}

/* ethtool .get_rxfh_indir_size */
static u32 dn200_get_rxfh_indir_size(struct net_device *dev)
{
	struct dn200_priv *priv = netdev_priv(dev);

	return ARRAY_SIZE(priv->rss.table);
}

/* ethtool .get_rxfh: report indirection table, key and Toeplitz hash. */
static int dn200_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
			  u8 *hfunc)
{
	struct dn200_priv *priv = netdev_priv(dev);
	int i;

	if (indir) {
		for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++)
			indir[i] = priv->rss.table[i];
	}

	if (key)
		memcpy(key, priv->rss.key, sizeof(priv->rss.key));
	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;

	return 0;
}

/* ethtool .set_rxfh: PFs only; Toeplitz hash only; reprogram HW RSS. */
static int dn200_set_rxfh(struct net_device *dev, const u32 *indir,
			  const u8 *key, const u8 hfunc)
{
	struct dn200_priv *priv = netdev_priv(dev);
	int i;

	if (HW_IS_VF(priv->hw))
		return -EOPNOTSUPP;

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;

	if (indir) {
		for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++)
			priv->rss.table[i] = indir[i];
	}

	if (key)
		memcpy(priv->rss.key, key, sizeof(priv->rss.key));

	return dn200_rss_configure(priv, priv->hw, &priv->rss,
				   priv->plat->rx_queues_to_use);
}
+static void dn200_get_max_channels(struct dn200_priv *priv, + unsigned int *rx, unsigned int *tx) +{ + unsigned int rx_max, tx_max; + + if (priv->plat_ex->pf_id == 0 || priv->plat_ex->pf_id == 1) { + rx_max = 8; + tx_max = 8; + } + if (priv->plat_ex->pf_id == 2 || priv->plat_ex->pf_id == 3) { + if (PRIV_SRIOV_SUPPORT(priv)) { + rx_max = 1; + tx_max = 1; + } else { + rx_max = 2; + tx_max = 2; + } + } + if (PRIV_IS_VF(priv)) { + rx_max = + priv->plat_ex->rx_queues_reserved / priv->plat_ex->max_vfs; + tx_max = + priv->plat_ex->tx_queues_reserved / priv->plat_ex->max_vfs; + tx_max = min_t(unsigned int, DN200_MAX_QPS_PER_VF, tx_max); + tx_max = min_t(unsigned int, DN200_MAX_QPS_PER_VF, rx_max); + } + *rx = rx_max; + *tx = tx_max; +} + +static void dn200_get_channels(struct net_device *dev, + struct ethtool_channels *chan) +{ + struct dn200_priv *priv = netdev_priv(dev); + unsigned int rx, tx, combined; + + dn200_get_max_channels(priv, &rx, &tx); + combined = min(rx, tx); + chan->max_combined = combined; + rx = priv->plat->rx_queues_to_use; + tx = priv->plat->tx_queues_to_use; + combined = min(rx, tx); + chan->combined_count = combined; +} + +static int dn200_set_channels(struct net_device *dev, + struct ethtool_channels *chan) +{ + unsigned int rx, tx, cur_rx, cur_tx, rx_max, tx_max, combined_max; + struct dn200_priv *priv = netdev_priv(dev); + int i = 0; + struct dn200_fdir_filter *input; + int err = 0; + + dn200_get_max_channels(priv, &rx_max, &tx_max); + combined_max = min(rx_max, tx_max); + /* Should not be setting other count */ + if (chan->other_count) { + netdev_err(dev, "other channel count must be zero\n"); + return -EINVAL; + } + + /* Require at least one Combined (Rx and Tx) channel */ + if (!chan->combined_count) { + netdev_err(dev, + "at least one combined Rx/Tx channel is required\n"); + return -EINVAL; + } + + /* Check combined channels */ + if (chan->combined_count > combined_max) { + netdev_err(dev, + "combined channel count cannot exceed %u\n", + 
combined_max); + return -EINVAL; + } + + /* Can have some Rx-only or Tx-only channels, but not both */ + if (chan->rx_count || chan->tx_count) { + netdev_err(dev, "cannot specify Rx or Tx channels\n"); + return -EINVAL; + } + + for (; i < priv->flow_entries_max - 4; i++) { + input = &priv->fdir_enties[i]; + if (input && input->enable && + (input->action & DN200_FLOW_ACTION_ROUTE) && + input->queue >= chan->combined_count) { + netdev_warn(dev, + "Existing user defined filter %d assigns flow to queue %d\n", + i, input->queue); + err = -EINVAL; + } + } + if (err) { + netdev_err(dev, + "Existing filter rules must be deleted to reduce combined channel count to %d\n", + chan->combined_count); + return err; + } + + rx = chan->combined_count; + tx = chan->combined_count; + + cur_rx = priv->plat->rx_queues_to_use; + cur_tx = priv->plat->tx_queues_to_use; + + if (rx == cur_rx && tx == cur_tx) + goto out; + return dn200_reinit_queues(dev, rx, tx); +out: + return 0; +} + +static int dn200_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct dn200_priv *priv = netdev_priv(dev); + + if ((!PRIV_IS_VF(priv)) + && (priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp)) { + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE; + + if (priv->ptp_clock) + info->phc_index = ptp_clock_index(priv->ptp_clock); + + info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON); + + info->rx_filters = ((1 << HWTSTAMP_FILTER_NONE) | + (1 << HWTSTAMP_FILTER_PTP_V1_L4_EVENT) | + (1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) | + (1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ) | + (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | + (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | + (1 << 
HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) | + (1 << HWTSTAMP_FILTER_ALL)); + return 0; + } else { + return ethtool_op_get_ts_info(dev, info); + } +} + +static int dn200_set_priv_flags(struct net_device *dev, u32 flags) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 i, orig_flags, new_flags, changed_flags; + int ret = 0; + + /*needs according to priv->flags to operate */ + orig_flags = priv->eth_priv_flags; + new_flags = orig_flags; + if (PRIV_IS_VF(priv)) + return -EOPNOTSUPP; + for (i = 0; i < DN200_PRIV_FLAGS_STR_LEN; i++) { + const struct dn200_priv_flags *priv_flags; + + priv_flags = &dn200_gstrings_priv_flags[i]; + + if (flags & BIT(i)) + new_flags |= priv_flags->flag; + else + new_flags &= ~(priv_flags->flag); + + /* If this is a read-only flag, it can't be changed */ + if (priv_flags->read_only && + ((orig_flags ^ new_flags) & ~BIT(i))) + return -EOPNOTSUPP; + } + changed_flags = orig_flags ^ new_flags; + + if (!changed_flags) + return 0; + + if (changed_flags & DN200_PRIV_FLAGS_FEC_EN) { + ret = dn200_phy_fec_enable(dev, + !!(new_flags & DN200_PRIV_FLAGS_FEC_EN)); + if (ret) + return ret; + + if (new_flags & DN200_PRIV_FLAGS_FEC_EN) + priv->eth_priv_flags |= DN200_PRIV_FLAGS_FEC_EN; + else + priv->eth_priv_flags &= ~DN200_PRIV_FLAGS_FEC_EN; + } + + if (changed_flags & DN200_PRIV_FLAGS_PTP_RXTX) { + ret = dn200_reinit_hwts(dev, !(orig_flags & DN200_PRIV_FLAGS_PTP_RXTX), + new_flags); + } + + return ret; +} + +static u32 dn200_get_priv_flags(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 i, ret_flags = 0; + + if (PRIV_IS_VF(priv)) + return -EOPNOTSUPP; + + for (i = 0; i < DN200_PRIV_FLAGS_STR_LEN; i++) { + const struct dn200_priv_flags *priv_flags; + + priv_flags = &dn200_gstrings_priv_flags[i]; + + if (priv_flags->flag & priv->eth_priv_flags) + ret_flags |= BIT(i); + } + return ret_flags; +} + +static int dn200_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct dn200_priv *priv = 
	    netdev_priv(dev);
	struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv);

	if (!phy_info->phy_ops->led_control)
		return -EOPNOTSUPP;
	if (!phy_info->phy_ops->blink_control)
		return -EOPNOTSUPP;
	switch (state) {
	case ETHTOOL_ID_ACTIVE:
		/* Toggle LED state every 2 seconds. */
		return 2;
	case ETHTOOL_ID_ON:
		if (phy_info->phydev)
			extern_phy_force_led(phy_info->phydev, priv, 1, 0);
		else
			dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_ENABLE);
		break;
	case ETHTOOL_ID_OFF:
		if (phy_info->phydev)
			extern_phy_force_led(phy_info->phydev, priv, 1, 1);
		break;
	case ETHTOOL_ID_INACTIVE:
		/* Restore normal LED behaviour: keep the FW blink on a
		 * running interface, otherwise disable it. */
		if (netif_running(dev)) {
			priv->blink_state_last = BLINK_ENABLE;
			if (!phy_info->phydev)
				dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_ENABLE);
		} else
			if (!phy_info->phydev)
				dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_DISABLE);

		if (phy_info->phydev) {
			/* Speed-dependent LED restore for external PHYs. */
			if (phy_info->phydev->link && phy_info->phydev->speed == SPEED_1000)
				extern_phy_force_led(phy_info->phydev, priv, 1, 0);
			else if (phy_info->phydev->link && phy_info->phydev->speed == SPEED_100)
				extern_phy_force_led(phy_info->phydev, priv, 2, 0);
			else
				extern_phy_force_led(phy_info->phydev, priv, 1, 1);
		}
		break;
	}
	return 0;
}

/* Report the currently enabled RSS hash fields for a flow type. */
static int dn200_get_rss_hash_opts(struct dn200_priv *priv,
				   struct ethtool_rxnfc *cmd)
{
	u32 flags = priv->rss.rss_flags;

	cmd->data = 0;

	/* Report default options for RSS on dn200 */
	switch (cmd->flow_type) {
	case TCP_V4_FLOW:
	case TCP_V6_FLOW:
		if (flags & DN200_RSS_TCP4TE)
			cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
		fallthrough;
	case IPV4_FLOW:
		if (flags & DN200_RSS_IP2TE)
			cmd->data |= RXH_IP_SRC | RXH_IP_DST;
		break;
	case UDP_V6_FLOW:
	case UDP_V4_FLOW:
		if (flags & DN200_RSS_UDP4TE)
			cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
		fallthrough;
	case IPV6_FLOW:
		if (flags & DN200_RSS_IP2TE)
			cmd->data |= RXH_IP_SRC | RXH_IP_DST;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/* ETHTOOL_GRXCLSRULE: translate a stored flow-director filter back into
 * the ethtool_rx_flow_spec the user originally supplied. */
static int dn200_get_fdir_entry(struct dn200_priv *priv,
				struct ethtool_rxnfc *cmd)
{
	struct ethtool_rx_flow_spec *fsp;
	struct dn200_fdir_filter *input;

	fsp = (struct ethtool_rx_flow_spec *)&cmd->fs;

	/* The last 4 entries of the table are reserved. */
	if (fsp->location >= (priv->flow_entries_max - 4))
		return -EINVAL;
	input = &priv->fdir_enties[fsp->location];
	if (!input->enable)
		return -EINVAL;

	/* set flow type field */
	if (input->flow_type & DN200_FLOW_TYPE_V4) {
		if (input->flow_type & DN200_FLOW_TYPE_UDP)
			fsp->flow_type = UDP_V4_FLOW;
		else if (input->flow_type & DN200_FLOW_TYPE_TCP)
			fsp->flow_type = TCP_V4_FLOW;
		else
			fsp->flow_type = IPV4_USER_FLOW;
		fsp->h_u.tcp_ip4_spec.psrc = input->src_port;
		fsp->h_u.tcp_ip4_spec.pdst = input->dst_port;
		fsp->m_u.tcp_ip4_spec.psrc = 0;
		fsp->m_u.tcp_ip4_spec.pdst = 0;
		fsp->h_u.tcp_ip4_spec.ip4src = input->src_ip;
		fsp->m_u.tcp_ip4_spec.ip4src = input->src_ip_mask;
		fsp->h_u.tcp_ip4_spec.ip4dst = input->dst_ip;
		fsp->m_u.tcp_ip4_spec.ip4dst = input->dst_ip_mask;
		fsp->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4;
	} else {
		if (input->flow_type & DN200_FLOW_TYPE_UDP)
			fsp->flow_type = UDP_V6_FLOW;
		else if (input->flow_type & DN200_FLOW_TYPE_TCP)
			fsp->flow_type = TCP_V6_FLOW;
		else
			fsp->flow_type = IPV6_USER_FLOW;
		memset(fsp->h_u.tcp_ip6_spec.ip6src, 0, sizeof(__be32) * 4);
		memset(fsp->m_u.tcp_ip6_spec.ip6src, 0, sizeof(__be32) * 4);
		memset(fsp->h_u.tcp_ip6_spec.ip6dst, 0, sizeof(__be32) * 4);
		memset(fsp->m_u.tcp_ip6_spec.ip6dst, 0, sizeof(__be32) * 4);
		fsp->h_u.tcp_ip6_spec.psrc = input->src_port;
		fsp->h_u.tcp_ip6_spec.pdst = input->dst_port;

		/* IPv6 rules store either a source or dest address. */
		if ((input->flow_type & DN200_FLOW_TYPE_V6)
		    && (input->flow_type & DN200_FLOW_TYPE_SA)) {
			memcpy(fsp->h_u.tcp_ip6_spec.ip6src, input->ip6,
			       sizeof(__be32) * 4);
			memcpy(fsp->m_u.tcp_ip6_spec.ip6src, input->ip6_mask,
			       sizeof(__be32) * 4);
		} else if ((input->flow_type & DN200_FLOW_TYPE_V6)
			   && (input->flow_type & DN200_FLOW_TYPE_DA)) {
			memcpy(fsp->h_u.tcp_ip6_spec.ip6dst, input->ip6,
			       sizeof(__be32) * 4);
			memcpy(fsp->m_u.tcp_ip6_spec.ip6dst, input->ip6_mask,
			       sizeof(__be32) * 4);
		}
	}
	/* record action */
	if (input->action == DN200_FLOW_ACTION_DROP)
		fsp->ring_cookie = RX_CLS_FLOW_DISC;
	else
		fsp->ring_cookie = input->queue;
	return 0;
}

/* ETHTOOL_GRXCLSRLALL: list the locations of all enabled rules. */
static int dn200_get_fdir_all(struct dn200_priv *priv,
			      struct ethtool_rxnfc *cmd, u32 *rule_locs)
{
	struct dn200_fdir_filter *input;
	int cnt = 0;
	int i = priv->flow_entries_max - 5;

	/* report total rule count */
	cmd->data = priv->flow_entries_max - 4;
	for (; i >= 0; i--) {
		input = &priv->fdir_enties[i];
		if (input->enable) {
			rule_locs[cnt] = i;
			cnt++;
		}
	}
	cmd->rule_cnt = cnt;
	return 0;
}

/* ethtool .get_rxnfc dispatcher (PF only). */
static int dn200_get_rxnfc(struct net_device *netdev,
			   struct ethtool_rxnfc *cmd, u32 *rule_locs)
{
	struct dn200_priv *priv = netdev_priv(netdev);

	if (HW_IS_VF(priv->hw))
		return -EOPNOTSUPP;

	switch (cmd->cmd) {
	case ETHTOOL_GRXRINGS:
		cmd->data = priv->plat->rx_queues_to_use;
		return 0;
	case ETHTOOL_GRXFH:
		return dn200_get_rss_hash_opts(priv, cmd);
	case ETHTOOL_GRXCLSRLCNT:
		cmd->rule_cnt = priv->fdir_counts;
		return 0;
	case ETHTOOL_GRXCLSRULE:
		return dn200_get_fdir_entry(priv, cmd);
	case ETHTOOL_GRXCLSRLALL:
		return dn200_get_fdir_all(priv, cmd, rule_locs);
	default:
		return -EOPNOTSUPP;
	}
}

/* ETHTOOL_SRXFH: update the RSS hash field selection. */
static int dn200_set_rss_hash_opt(struct dn200_priv *priv,
				  struct ethtool_rxnfc *nfc)
{
	u32 flags = priv->rss.rss_flags;

	/*
	 * RSS does not support anything other than hashing
	 * to queues on src and dst IPs and ports
	 */
	if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST |
			  RXH_L4_B_0_1 | RXH_L4_B_2_3))
		return -EINVAL;

	switch (nfc->flow_type) {
	case TCP_V4_FLOW:
	case TCP_V6_FLOW:
		if (!(nfc->data & (RXH_IP_SRC | RXH_IP_DST)) &&
		    !(nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)))
			return -EINVAL;
		flags &= ~(DN200_RSS_IP2TE | DN200_RSS_TCP4TE);
		if (nfc->data & (RXH_IP_SRC | RXH_IP_DST))
			flags |= DN200_RSS_IP2TE;
		if (nfc->data & (RXH_L4_B_0_1 |
				 RXH_L4_B_2_3))
			flags |= DN200_RSS_TCP4TE;
		break;
	case UDP_V4_FLOW:
	case UDP_V6_FLOW:
		if (!(nfc->data & (RXH_IP_SRC | RXH_IP_DST)) &&
		    !(nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)))
			return -EINVAL;
		flags &= ~(DN200_RSS_IP2TE | DN200_RSS_UDP4TE);
		if (nfc->data & (RXH_IP_SRC | RXH_IP_DST))
			flags |= DN200_RSS_IP2TE;
		if (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3))
			flags |= DN200_RSS_UDP4TE;
		break;
	default:
		return -EINVAL;
	}

	/* if we changed something we need to update flags */
	if (flags != priv->rss.rss_flags) {
		priv->rss.rss_flags = flags;
		dn200_rss_configure(priv, priv->hw, &priv->rss,
				    priv->plat->rx_queues_to_use);
	}

	return 0;
}

/* Return 1 for flow types the flow director can program, else 0. */
static int dn200_check_flow_type_supported(struct ethtool_rx_flow_spec *fsp)
{
	switch (fsp->flow_type) {
	case TCP_V4_FLOW:
	case UDP_V4_FLOW:
	case IPV4_USER_FLOW:
	case IPV6_USER_FLOW:
		return 1;
	case TCP_V6_FLOW:
	case UDP_V6_FLOW:
		return 1;
	default:
		return 0;
	}

	return 0;
}

/* The HW cannot mix TCP and UDP L4 rules: return 1 on conflict. */
static int dn200_check_flow_type_conflict(struct ethtool_rx_flow_spec *fsp,
					  struct dn200_fdir_info *fdir_info)
{
	switch (fsp->flow_type) {
	case TCP_V4_FLOW:
	case TCP_V6_FLOW:
		if (fdir_info->l4_udp_count)
			return 1;
		break;
	case UDP_V6_FLOW:
	case UDP_V4_FLOW:
		if (fdir_info->l4_tcp_count)
			return 1;
		break;
	default:
		return 0;
	}

	return 0;
}

/* Convert a contiguous-from-MSB ethtool IP mask into the number of
 * ignored low-order bits the XGMAC expects: 0 for 0xffffffff up to 32
 * for an all-zero mask; -1 for a non-contiguous (unsupported) mask. */
static int dn200_ethtool_mask_to_xgmac_mask(struct dn200_priv *priv, u32 mask)
{
	u32 mask_tmp = 0xffffffff;
	u32 i = 0;

	if (!mask)
		return 32;
	for (; mask_tmp;) {
		if (mask == mask_tmp)
			return i;
		i++;
		mask_tmp = (mask_tmp << 1);
	}
	netdev_err(priv->dev, "Unsupported ip mask\n");
	return -1;
}

/* Return true when two flow-director filters describe the same flow. */
static bool dn200_match_fdir_filter(struct dn200_fdir_filter *a,
				    struct dn200_fdir_filter *b)
{
	/* The filters do not match if any of these criteria differ. 
*/ + if (a->dst_port != b->dst_port || + a->src_port != b->src_port || + a->flow_type != b->flow_type || + ((a->flow_type & DN200_FLOW_TYPE_V4) && + (a->dst_ip != b->dst_ip || + a->src_ip != b->src_ip || + a->xgmac_mask_src != b->xgmac_mask_src || + a->xgmac_mask_dst != b->xgmac_mask_dst)) || + ((a->flow_type & DN200_FLOW_TYPE_V6) && + (a->ip6_address != b->ip6_address || + a->ip6[0] != b->ip6[0] || + a->ip6[1] != b->ip6[1] || + a->ip6[2] != b->ip6[2] || + a->ip6[3] != b->ip6[3] || + a->ip6_mask[0] != b->ip6_mask[0] || + a->ip6_mask[1] != b->ip6_mask[1] || + a->ip6_mask[2] != b->ip6_mask[2] || + a->ip6_mask[3] != b->ip6_mask[3]))) + return false; + + return true; +} + +static int dn200_disallow_matching_filters(struct dn200_priv *priv, + struct dn200_fdir_filter *input) +{ + struct dn200_fdir_filter *tmp; + int i = priv->flow_entries_max - 5; + + for (; i >= 0; i--) { + tmp = &priv->fdir_enties[i]; + if (i == input->reg_idx) + continue; + if (input->enable) + continue; + if (dn200_match_fdir_filter(tmp, input)) { + netdev_err(priv->dev, + "Existing user defined filter %d already matches this flow.\n", i); + return -EINVAL; + } + } + return 0; +} + +#define XGMAC_NTUPLE_MAX_V4_ADDR_MASK_BITS (31) +#define XGMAC_NTUPLE_MAX_V6_ADDR_MASK_BITS (127) +static int dn200_add_fdir_ethtool(struct dn200_priv *priv, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp; + u8 queue = 0; + u8 action = DN200_FLOW_ACTION_ROUTE; + struct dn200_fdir_filter *input; + u8 ip6_address = 0; + int mask; + int ret = 0; + + if (!(priv->dev->features & NETIF_F_NTUPLE)) { + netdev_err(priv->dev, "Cannot configure new rule when ntuple is disabled\n"); + return -EOPNOTSUPP; + } + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + /* Extended MAC field or vlan field are not supported */ + if ((fsp->flow_type & FLOW_MAC_EXT) || (fsp->flow_type & FLOW_EXT)) + return -EINVAL; + /* Don't allow indexes to exist outside of available space */ + if (fsp->location >= (priv->flow_entries_max - 
4)) { + netdev_err(priv->dev, "Location out of range\n"); + return -EINVAL; + } + + if (fsp->ring_cookie == RX_CLS_FLOW_DISC) { + action = DN200_FLOW_ACTION_DROP; + } else { + u32 ring = ethtool_get_flow_spec_ring(fsp->ring_cookie); + + if (ring >= priv->plat->rx_queues_to_use) + return -EINVAL; + queue = ring; + } + + /* record flow type */ + if (!dn200_check_flow_type_supported(fsp)) { + netdev_err(priv->dev, "Unrecognized flow type: %d\n", + fsp->flow_type); + return -EINVAL; + } + + /*Whether a type conflict exists */ + if (dn200_check_flow_type_conflict(fsp, &priv->fdir_info)) { + netdev_err(priv->dev, "Conflict flow type existed\n"); + return -EINVAL; + } + input = &priv->fdir_enties[fsp->location]; + if (input->enable) { + netdev_err(priv->dev, "Location(%d) is enabled\n", + fsp->location); + return -EINVAL; + } + memset(input, 0, sizeof(*input)); + if (fsp->flow_type == UDP_V6_FLOW || fsp->flow_type == UDP_V4_FLOW) { + input->flow_type |= DN200_FLOW_TYPE_UDP; + } else if (fsp->flow_type == TCP_V6_FLOW + || fsp->flow_type == TCP_V4_FLOW) { + input->flow_type |= DN200_FLOW_TYPE_TCP; + } + if (fsp->flow_type == UDP_V6_FLOW || + fsp->flow_type == TCP_V6_FLOW || fsp->flow_type == IPV6_USER_FLOW) { + /* Reverse the src and dest notion, since the HW expects them + * to be from Tx perspective where as the input from user is + * from Rx filter view. 
+ */ + ip6_address = 0; + if ((fsp->h_u.tcp_ip6_spec.ip6src[0] != 0 || + fsp->h_u.tcp_ip6_spec.ip6src[1] != 0 || + fsp->h_u.tcp_ip6_spec.ip6src[2] != 0 || + fsp->h_u.tcp_ip6_spec.ip6src[3] != 0) + ) { + ip6_address |= DN200_L3L4_IPV6_SA; + } + + if ((fsp->h_u.tcp_ip6_spec.ip6dst[0] != 0 || + fsp->h_u.tcp_ip6_spec.ip6dst[1] != 0 || + fsp->h_u.tcp_ip6_spec.ip6dst[2] != 0 || + fsp->h_u.tcp_ip6_spec.ip6dst[3] != 0) + ) { + ip6_address |= DN200_L3L4_IPV6_DA; + } + if (ip6_address == (DN200_L3L4_IPV6_SA | DN200_L3L4_IPV6_DA)) { + netdev_err(priv->dev, "Ipv6 only support source or dest ip address\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_V6; + input->reg_idx = fsp->location; + if (fsp->flow_type == IPV6_USER_FLOW) { + if (fsp->h_u.usr_ip6_spec.l4_4_bytes) { + netdev_err(priv->dev, "Ipv6 User spec not support L4 config\n"); + return -EOPNOTSUPP; + } + } else { + input->dst_port = fsp->h_u.tcp_ip6_spec.pdst; + input->src_port = fsp->h_u.tcp_ip6_spec.psrc; + } + if (input->dst_port) { + if (fsp->m_u.tcp_ip6_spec.pdst != 0xFFFF) { + netdev_err(priv->dev, + "The port mask is unsupported\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_DPORT; + } + if (input->src_port) { + if (fsp->m_u.tcp_ip6_spec.psrc != 0xFFFF) { + netdev_err(priv->dev, + "The port mask is unsupported\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_SPORT; + } + + if (ip6_address & DN200_L3L4_IPV6_SA) { + input->flow_type |= DN200_FLOW_TYPE_SA; + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6src[3])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src = mask; + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6src[2])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6src[1])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + mask = 
dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6src[0])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + if (input->xgmac_mask_src > + XGMAC_NTUPLE_MAX_V6_ADDR_MASK_BITS) + netdev_err(priv->dev, "Mask is not supported at the highest bit\n"); + memcpy(input->ip6, fsp->h_u.tcp_ip6_spec.ip6src, + sizeof(__be32) * 4); + memcpy(input->ip6_mask, fsp->m_u.tcp_ip6_spec.ip6src, + sizeof(__be32) * 4); + } else if (ip6_address & DN200_L3L4_IPV6_DA) { + input->flow_type |= DN200_FLOW_TYPE_DA; + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6dst[3])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src = mask; + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6dst[2])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6dst[1])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + mask = dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(fsp->m_u.tcp_ip6_spec.ip6dst[0])); + if (mask == -1) + return -EINVAL; + input->xgmac_mask_src += mask; + if (input->xgmac_mask_src > + XGMAC_NTUPLE_MAX_V6_ADDR_MASK_BITS) + netdev_err(priv->dev, "Mask is not supported at the highest bit\n"); + memcpy(input->ip6, fsp->h_u.tcp_ip6_spec.ip6dst, + sizeof(__be32) * 4); + memcpy(input->ip6_mask, fsp->m_u.tcp_ip6_spec.ip6dst, + sizeof(__be32) * 4); + } + } else { + input->flow_type |= DN200_FLOW_TYPE_V4; + if (fsp->flow_type == IPV4_USER_FLOW) { + if (fsp->h_u.usr_ip4_spec.l4_4_bytes) { + netdev_err(priv->dev, "Ipv4 User spec not support L4 config\n"); + return -EOPNOTSUPP; + } + } else { + input->dst_port = fsp->h_u.tcp_ip4_spec.pdst; + input->src_port = fsp->h_u.tcp_ip4_spec.psrc; + } + input->reg_idx = fsp->location; + input->dst_ip = (fsp->h_u.tcp_ip4_spec.ip4dst); + input->dst_ip_mask = (fsp->m_u.tcp_ip4_spec.ip4dst); + input->src_ip = 
(fsp->h_u.tcp_ip4_spec.ip4src); + input->src_ip_mask = (fsp->m_u.tcp_ip4_spec.ip4src); + if (input->dst_port) { + if (fsp->m_u.tcp_ip4_spec.pdst != 0xFFFF) { + netdev_err(priv->dev, + "The port mask is unsupported\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_DPORT; + } + if (input->src_port) { + if (fsp->m_u.tcp_ip4_spec.psrc != 0xFFFF) { + netdev_err(priv->dev, + "The port mask is unsupported\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_SPORT; + } + if (input->dst_ip) { + input->xgmac_mask_dst = + dn200_ethtool_mask_to_xgmac_mask(priv, ntohl(input->dst_ip_mask)); + if (input->xgmac_mask_dst > + XGMAC_NTUPLE_MAX_V4_ADDR_MASK_BITS || + input->xgmac_mask_dst == -1) { + if (input->xgmac_mask_dst > + XGMAC_NTUPLE_MAX_V4_ADDR_MASK_BITS) + netdev_err(priv->dev, "Mask is not supported at the highest bit\n"); + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_DA; + } + if (input->src_ip) { + input->xgmac_mask_src = + dn200_ethtool_mask_to_xgmac_mask(priv, + ntohl(input->src_ip_mask)); + if (input->xgmac_mask_src > XGMAC_NTUPLE_MAX_V4_ADDR_MASK_BITS + || input->xgmac_mask_src == -1) { + if (input->xgmac_mask_src > + XGMAC_NTUPLE_MAX_V4_ADDR_MASK_BITS) { + netdev_err(priv->dev, "Mask is not supported at the highest bit\n"); + } + return -EINVAL; + } + input->flow_type |= DN200_FLOW_TYPE_SA; + } + } + if (fsp->flow_type == UDP_V6_FLOW || + fsp->flow_type == TCP_V6_FLOW || + fsp->flow_type == UDP_V4_FLOW || fsp->flow_type == TCP_V4_FLOW) { + if (!(input->flow_type & (DN200_FLOW_TYPE_DPORT | + DN200_FLOW_TYPE_SPORT))) { + netdev_err(priv->dev, + "L4 requires src-port/dst-port configuration\n"); + return -EINVAL; + } + } + ret = dn200_disallow_matching_filters(priv, input); + if (ret) { + memset(input, 0, sizeof(*input)); + return ret; + } + input->action = action; + input->queue = queue; + ret = dn200_config_ntuple_filter(priv, priv->hw, fsp->location, input, + true); + if (ret) { + memset(input, 0, sizeof(*input)); + return ret; 
+ } + if (action == DN200_FLOW_ACTION_DROP) + priv->fdir_map |= (1 << fsp->location); + + if (input->flow_type & DN200_FLOW_TYPE_UDP) + priv->fdir_info.l4_udp_count++; + else if (input->flow_type & DN200_FLOW_TYPE_TCP) + priv->fdir_info.l4_tcp_count++; + + input->enable = true; + priv->fdir_counts++; + return 0; +} + +static int dn200_del_fdir_ethtool(struct dn200_priv *priv, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp; + struct dn200_fdir_filter *input; + int ret = 0; + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + /* Don't allow indexes to exist outside of available space */ + if (fsp->location >= (priv->flow_entries_max - 4)) { + netdev_err(priv->dev, "Location out of range\n"); + return -EINVAL; + } + + input = &priv->fdir_enties[fsp->location]; + if (!input->enable) { + netdev_err(priv->dev, "Location %d is not enabled\n", + fsp->location); + return -EINVAL; + } + + ret = dn200_config_ntuple_filter(priv, priv->hw, fsp->location, input, + false); + if (ret) + return ret; + /*Delete the udp/tcp entry count */ + if (input->flow_type & DN200_FLOW_TYPE_UDP) + priv->fdir_info.l4_udp_count--; + else if (input->flow_type & DN200_FLOW_TYPE_TCP) + priv->fdir_info.l4_tcp_count--; + + memset(input, 0, sizeof(*input)); + priv->fdir_map &= ~(1 << fsp->location); + priv->fdir_counts--; + return 0; +} + +static int dn200_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct dn200_priv *priv = netdev_priv(netdev); + + if (HW_IS_VF(priv->hw)) + return -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + return dn200_set_rss_hash_opt(priv, cmd); + case ETHTOOL_SRXCLSRLINS: + return dn200_add_fdir_ethtool(priv, cmd); + case ETHTOOL_SRXCLSRLDEL: + return dn200_del_fdir_ethtool(priv, cmd); + default: + return -EOPNOTSUPP; + } +} + +static int dn200_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = 
PRIV_PHY_INFO(priv); + int status; + u8 sff8472_rev, addr_mode; + bool page_swap = false; + + /*VF do not support to access i2c */ + if (HW_IS_VF(priv->hw) || phy_info->phydev) + return -EOPNOTSUPP; + /*SFP module absent */ + if (phy_info->sfp_mod_absent) { + netdev_err(netdev, "sfp module not present\n"); + return -EOPNOTSUPP; + } + /*only fibre interface support get sfp module info */ + if (!phy_info->xpcs || !phy_info->xpcs_sfp_valid) { + netdev_err(netdev, "XPCS I2C not valid\n"); + return -EOPNOTSUPP; + } + /* Check whether we support SFF-8472 or not */ + status = phy_info->phy_ops->read_i2c_eeprom(phy_info, + DN200_SFF_SFF_8472_COMP, &sff8472_rev); + if (status) + return -EIO; + + /* addressing mode is not supported */ + status = phy_info->phy_ops->read_i2c_eeprom(phy_info, + DN200_SFF_SFF_8472_SWAP, &addr_mode); + if (status) + return -EIO; + + if (addr_mode & DN200_SFF_ADDRESSING_MODE) { + netdev_err(priv->dev, + "Address change required to access page 0xA2, but not supported. Please report the module type to the driver maintainers.\n"); + page_swap = true; + } + + if (sff8472_rev == DN200_SFF_SFF_8472_UNSUP || page_swap || + !(addr_mode & DN200_SFF_DDM_IMPLEMENTED)) { + /* We have a SFP, but it does not support SFF-8472 */ + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else { + /* We have a SFP which supports a revision of SFF-8472. 
*/ + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } + + return 0; +} + +static int dn200_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + int status = 0; + u8 databyte = 0xFF; + int i = 0; + + /*VF do not support access i2c */ + if (HW_IS_VF(priv->hw) || phy_info->phydev) + return -EOPNOTSUPP; + /*SFP module absent */ + if (phy_info->sfp_mod_absent) { + netdev_err(netdev, "sfp module not present\n"); + return -EOPNOTSUPP; + } + /*only fibre interface support get sfp module info */ + if (!phy_info->xpcs || !phy_info->xpcs_sfp_valid) { + netdev_err(netdev, "XPCS I2C not valid\n"); + return -EOPNOTSUPP; + } + + for (i = ee->offset; i < ee->offset + ee->len; i++) { + /* I2C reads can take long time */ + if (test_bit(DN200_SFP_IN_INIT, &priv->state)) + return -EBUSY; + + if (i < ETH_MODULE_SFF_8079_LEN) + status = phy_info->phy_ops->read_i2c_eeprom(phy_info, i, + &databyte); + else + status = phy_info->phy_ops->read_i2c_sff8472(phy_info, i, + &databyte); + + if (status) + return -EIO; + + data[i - ee->offset] = databyte; + } + return 0; +} + +static void dn200_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + wol->wolopts = 0; + wol->supported = 0; +} + +static int dn200_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + return -EOPNOTSUPP; +} + +#define DN200_EEPROM_SIZE 512 +static int dn200_get_eeprom_len(struct net_device *netdev) +{ + return DN200_EEPROM_SIZE; +} + +static int dn200_get_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + int ret; + u8 *data; + struct pci_dev *pdev; + struct dn200_priv *priv = netdev_priv(netdev); + + if (!eeprom->len || eeprom->offset + eeprom->len > DN200_EEPROM_SIZE) + return -EINVAL; + + pdev = container_of(priv->device, struct pci_dev, dev); + eeprom->magic = pdev->vendor | 
(pdev->device << 16); + + data = kmalloc(DN200_EEPROM_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = dn200_eeprom_read(&priv->plat_ex->ctrl, 0, DN200_EEPROM_SIZE, data); + if (!ret) + memcpy(bytes, data + eeprom->offset, eeprom->len); + + kfree(data); + + return ret; +} + +static int dn200_set_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + int ret; + u8 *data; + struct pci_dev *pdev; + struct dn200_priv *priv = netdev_priv(netdev); + + if (!eeprom->len || eeprom->offset + eeprom->len > DN200_EEPROM_SIZE) + return -EINVAL; + + pdev = container_of(priv->device, struct pci_dev, dev); + if (eeprom->magic != (pdev->vendor | (pdev->device << 16))) + return -EFAULT; + + data = kmalloc(DN200_EEPROM_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = dn200_eeprom_read(&priv->plat_ex->ctrl, 0, DN200_EEPROM_SIZE, data); + if (ret) { + kfree(data); + return ret; + } + + memcpy(data + eeprom->offset, bytes, eeprom->len); + + ret = dn200_eeprom_write(&priv->plat_ex->ctrl, 0, DN200_EEPROM_SIZE, data); + kfree(data); + + return ret; +} + +static const struct ethtool_ops dn200_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX, + .get_drvinfo = dn200_ethtool_getdrvinfo, + .get_msglevel = dn200_ethtool_getmsglevel, + .set_msglevel = dn200_ethtool_setmsglevel, + .get_regs = dn200_ethtool_gregs, + .get_regs_len = dn200_ethtool_get_regs_len, + .get_link = ethtool_op_get_link, + .nway_reset = dn200_nway_reset, + .get_ringparam = dn200_get_ringparam, + .set_ringparam = dn200_set_ringparam, + .get_pauseparam = dn200_get_pauseparam, + .set_pauseparam = dn200_set_pauseparam, + .self_test = dn200_selftest_run, + .set_phys_id = dn200_set_phys_id, + .get_ethtool_stats = dn200_get_ethtool_stats, + .get_strings = dn200_get_strings, + .get_eee = dn200_ethtool_op_get_eee, + .set_eee = dn200_ethtool_op_set_eee, + 
.get_sset_count = dn200_get_sset_count, + .get_rxnfc = dn200_get_rxnfc, + .set_rxnfc = dn200_set_rxnfc, + .get_rxfh_key_size = dn200_get_rxfh_key_size, + .get_rxfh_indir_size = dn200_get_rxfh_indir_size, + .get_rxfh = dn200_get_rxfh, + .set_rxfh = dn200_set_rxfh, + .get_ts_info = dn200_get_ts_info, + .get_coalesce = dn200_get_coalesce, + .set_coalesce = dn200_set_coalesce, + .get_per_queue_coalesce = dn200_get_per_queue_coalesce, + .set_per_queue_coalesce = dn200_set_per_queue_coalesce, + .get_channels = dn200_get_channels, + .set_channels = dn200_set_channels, + .get_priv_flags = dn200_get_priv_flags, + .set_priv_flags = dn200_set_priv_flags, + .get_link_ksettings = dn200_ethtool_get_link_ksettings, + .set_link_ksettings = dn200_ethtool_set_link_ksettings, + .get_module_info = dn200_get_module_info, + .get_module_eeprom = dn200_get_module_eeprom, + .flash_device = dn200_load_firmware, + .get_wol = dn200_get_wol, + .set_wol = dn200_set_wol, + .get_eeprom_len = dn200_get_eeprom_len, + .get_eeprom = dn200_get_eeprom, + .set_eeprom = dn200_set_eeprom, +}; + +void dn200_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &dn200_ethtool_ops; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_hwtstamp.c b/drivers/net/ethernet/dapustor/dn200/dn200_hwtstamp.c new file mode 100644 index 000000000000..09d4a424b500 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_hwtstamp.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include +#include +#include +#include +#include "common.h" +#include "dn200_ptp.h" +#include "dn200.h" + +static void config_hw_tstamping(void __iomem *ioaddr, u32 data) +{ + writel(data, ioaddr + PTP_TCR); +} + +static void config_sub_second_increment(void __iomem *ioaddr, + u32 ptp_clock, int gmac4, u32 *ssinc) +{ + u32 value = readl(ioaddr + PTP_TCR); + unsigned long data; + u32 reg_value; + + /* For GMAC3.x, 4.x versions, in "fine adjustement mode" set sub-second + * increment to twice the number of nanoseconds of a clock cycle. + * The calculation of the default_addend value by the caller will set it + * to mid-range = 2^31 when the remainder of this division is zero, + * which will make the accumulator overflow once every 2 ptp_clock + * cycles, adding twice the number of nanoseconds of a clock cycle : + * 2000000000ULL / ptp_clock. + */ + if (value & PTP_TCR_TSCFUPDT) + data = (2000000000ULL / ptp_clock); + else + data = (1000000000ULL / ptp_clock); + + /* 0.465ns accuracy */ + if (!(value & PTP_TCR_TSCTRLSSR)) + data = (data * 1000) / 465; + + data &= PTP_SSIR_SSINC_MASK; + + reg_value = data; + if (gmac4) + reg_value <<= GMAC4_PTP_SSIR_SSINC_SHIFT; + + writel(reg_value, ioaddr + PTP_SSIR); + + if (ssinc) + *ssinc = data; +} + +static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec) +{ + u32 value; + + writel(sec, ioaddr + PTP_STSUR); + writel(nsec, ioaddr + PTP_STNSUR); + /* issue command to initialize the system time value */ + value = readl(ioaddr + PTP_TCR); + value |= PTP_TCR_TSINIT; + writel(value, ioaddr + PTP_TCR); + + /* wait for present system time initialize to complete */ + return readl_poll_timeout_atomic(ioaddr + PTP_TCR, value, + !(value & PTP_TCR_TSINIT), 10, 100000); +} + +static int config_addend(void __iomem *ioaddr, u32 addend) +{ + u32 value; + int limit; + + writel(addend, ioaddr + PTP_TAR); + /* issue command to update the addend value */ + value = readl(ioaddr + PTP_TCR); + value |= PTP_TCR_TSADDREG; + writel(value, 
ioaddr + PTP_TCR); + + /* wait for present addend update to complete */ + limit = 10; + while (limit--) { + if (!(readl(ioaddr + PTP_TCR) & PTP_TCR_TSADDREG)) + break; + mdelay(10); + } + if (limit < 0) + return -EBUSY; + + return 0; +} + +static int adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec, + int add_sub, int gmac4) +{ + u32 value; + int limit; + + if (add_sub) { + /* If the new sec value needs to be subtracted with + * the system time, then MAC_STSUR reg should be + * programmed with (2^32 – ) + */ + if (gmac4) + sec = -sec; + + value = readl(ioaddr + PTP_TCR); + if (value & PTP_TCR_TSCTRLSSR) + nsec = (PTP_DIGITAL_ROLLOVER_MODE - nsec); + else + nsec = (PTP_BINARY_ROLLOVER_MODE - nsec); + } + + writel(sec, ioaddr + PTP_STSUR); + value = (add_sub << PTP_STNSUR_ADDSUB_SHIFT) | nsec; + writel(value, ioaddr + PTP_STNSUR); + + /* issue command to initialize the system time value */ + value = readl(ioaddr + PTP_TCR); + value |= PTP_TCR_TSUPDT; + writel(value, ioaddr + PTP_TCR); + + /* wait for present system time adjust/update to complete */ + limit = 10; + while (limit--) { + if (!(readl(ioaddr + PTP_TCR) & PTP_TCR_TSUPDT)) + break; + mdelay(10); + } + if (limit < 0) + return -EBUSY; + + return 0; +} + +static void get_systime(void __iomem *ioaddr, u64 *systime) +{ + u64 ns, sec0, sec1; + + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + do { + sec0 = sec1; + /* Get the TSSS value */ + ns = readl_relaxed(ioaddr + PTP_STNSR); + /* Get the TSS value */ + sec1 = readl_relaxed(ioaddr + PTP_STSR); + } while (sec0 != sec1); + + if (systime) + *systime = ns + (sec1 * 1000000000ULL); +} + +static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) +{ + u64 ns; + + ns = readl(ptpaddr + PTP_ATNR); + ns += readl(ptpaddr + PTP_ATSR) * NSEC_PER_SEC; + + *ptp_time = ns; +} + +const struct dn200_hwtimestamp dn200_ptp = { + .config_hw_tstamping = config_hw_tstamping, + .init_systime = init_systime, + .config_sub_second_increment = 
config_sub_second_increment, + .config_addend = config_addend, + .adjust_systime = adjust_systime, + .get_systime = get_systime, + .get_ptptime = get_ptptime, +}; diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_iatu.c b/drivers/net/ethernet/dapustor/dn200/dn200_iatu.c new file mode 100644 index 000000000000..53fcedd6b133 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_iatu.c @@ -0,0 +1,855 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ + +#include "dn200.h" +#include "common.h" +#include "dn200_iatu.h" +#include "dn200_reg.h" + +#define DN200_IATU_BAR_OFFSET 0x20000 + +static inline int dn200_iatu_cfg_get(struct dn200_priv *priv, u8 *iatu_base, + u8 *iatu_num) +{ + int idx = 0; + u8 iatu_base_idx = 0, fun_iatu_num = 0; + struct plat_dn200_data *plat_ex = priv->plat_ex; + u8 pf_id; + + if (!priv->plat_ex) + return -EINVAL; + + pf_id = priv->plat_ex->pf_id; + /* add previous pfs iatu num to calculate current pf iatu base */ + for (idx = 0; idx < pf_id; idx++) { + iatu_base_idx += priv->plat_ex->pf_max_iatu[idx]; + iatu_base_idx += priv->plat_ex->vf_total_iatu[idx]; + } + + if (PRIV_IS_VF(priv)) { + struct dn200_vf_info info; + + dn200_get_vf_queue_info(plat_ex->pf.ioaddr, &info, pf_id, + plat_ex->vf_offset); + fun_iatu_num = info.iatu_num; + /* add current pf iatu num to get vf iatu base */ + iatu_base_idx += priv->plat_ex->pf_max_iatu[pf_id]; + /* add previous vfs iatu num to calculate current vf iatu base */ + for (idx = 0; idx < plat_ex->vf_offset; idx++) { + dn200_get_vf_queue_info(plat_ex->pf.ioaddr, &info, + pf_id, idx); + iatu_base_idx += info.iatu_num; + } + dev_dbg(priv->device, + "%s, %d, pf_id %d, vf_offset:%d, iatu_base_idx:%d, fun_iatu_num:%d!!\n", + __func__, __LINE__, pf_id, plat_ex->vf_offset, + iatu_base_idx, fun_iatu_num); + } else { + fun_iatu_num = priv->plat_ex->pf_max_iatu[pf_id]; + dev_dbg(priv->device, + "%s, %d, pf_id %d, vf_offset:%d, iatu_base_idx:%d, 
fun_iatu_num:%d!!\n", + __func__, __LINE__, pf_id, plat_ex->vf_offset, + iatu_base_idx, fun_iatu_num); + } + + if (iatu_base_idx < 0 || iatu_base_idx >= DN200_MAX_IATU_TBL_SIZE) { + dev_err(priv->device, + "%s, %d, iatu_base:%d is invalid, pls chk pf_max_iatu & vf_total_iatu!!\n", + __func__, __LINE__, iatu_base_idx); + return -EINVAL; + } + + if (fun_iatu_num < 1 || fun_iatu_num > DN200_MAX_IATU_TBL_SIZE) { + dev_err(priv->device, "%s, %d, iatu num %d is invalid!!\n", + __func__, __LINE__, fun_iatu_num); + return -EINVAL; + } + *iatu_base = iatu_base_idx; + *iatu_num = fun_iatu_num; + + return 0; +} + +void dn200_axi_init_for_raid(struct dn200_priv *priv) +{ + u32 pf_axi_base_addr = 0; + int i; + + /* to solve iatu base addr(32~40 bit:0xE0~0xFF) conflict with raid dma addr, + * should set 48th bit of 64bit dma address as 1 to deal with it, + * but xgmac just support 40 bit, so used axi bus register to + * set 48th bit dma addr as fixed 1 before route to pcie + * notes: + * 1. one xgmac have two axi addr (read & write) register, + * every addr reg length is 0x4 + * 2. xgmac 0 start from 0x20000344, other xgmac address is continuous from it + * 3. just append highest 24 bits to 40 bits dma address from xgamc to + * compose 64 bit dma address + * 4. 
dma engine send out with 40 bits dma addr + * -> axi regiter append highest 24bits to gen 64 bits addr + * -> pcie receive 64 bits addr and iatu translate + * highest 32 bit base address to target addr + */ + if (priv->plat_ex->raid_supported && !PRIV_IS_VF(priv)) { + pf_axi_base_addr = DN200_AXI_HIGH_ADDR_REG_BASE + + (priv->plat_ex->pf_id * 2) * 0x4; + for (i = 0; i < 2; i++) + fw_reg_write(&priv->plat_ex->ctrl, + pf_axi_base_addr + (0x4 * i), DN200_AXI_HIGH_24BIT_VAL); + } +} + +void dn200_axi_uninit_for_raid(struct dn200_priv *priv) +{ + u32 pf_axi_base_addr = 0; + int i; + + if (priv->plat_ex->raid_supported && !PRIV_IS_VF(priv)) { + /* clear current pf axi high 24 bits address */ + pf_axi_base_addr = DN200_AXI_HIGH_ADDR_REG_BASE + + (priv->plat_ex->pf_id * 2) * 0x4; + for (i = 0; i < 2; i++) + fw_reg_write(&priv->plat_ex->ctrl, + pf_axi_base_addr + (0x4 * i), 0); + } +} + +static int dn200_iatu_tbl_init(struct dn200_priv *priv) +{ + int i; + u8 pf_id = priv->plat_ex->pf_id; + u8 max_iatu = 0; + u64 iatu_base_addr = 0; + + struct dn200_func_iatu_basic_info *basic_info = + &priv->iatu_info.basic_info; + struct dn200_priv_iatu_map *iatu_dma32 = &priv->iatu_info.dma32_info; + struct dn200_iatu_tbl_entry *iatu_entry; + u8 iatu_start_idx = 0; + + if (dn200_iatu_cfg_get(priv, &iatu_start_idx, &max_iatu) < 0) + return -EINVAL; + dev_dbg(priv->device, + "%s, %d, pf:%d, iatu_start_idx:%d, max_iatu:%d.\n", __func__, + __LINE__, pf_id, iatu_start_idx, max_iatu); + + /* iatu base addr used in two conditions for translating + * desc dma addr ((base addr | iatu id) | (lowest 32bit addr)) + * to actual pcie target dma addr (target-dma addr high 32bit | lowest 32bit addr), + * one iatu cover 4GB addr space + * 1. configured to pcie iatu table as region base + * 2. 
used as highest 8bit of dma addr of + * ring descriptor(base addr | (dma addr & 0xffffffff) + */ + if (!priv->plat_ex->raid_supported) + iatu_base_addr = DN200_BASE_IATU_ADDR; + else + iatu_base_addr = DN200_RAID_BASE_IATU_ADDR; + + + for (i = iatu_start_idx; i < iatu_start_idx + max_iatu; i++) { + if (i >= DN200_MAX_IATU_TBL_SIZE) { + dev_err(priv->device, + "%s, %d, iatu index:%d exceed max iatu tbl size:%d, iatu start:%d, max:%d.\n", + __func__, __LINE__, i, DN200_MAX_IATU_TBL_SIZE, + iatu_start_idx, max_iatu); + break; + } + + iatu_entry = &basic_info->tbl[i]; + iatu_entry->tgt_addr = 0; + iatu_entry->base_addr = + ((iatu_base_addr >> MAX_LIMIT_RANGE_SHIFT) + i) + << MAX_LIMIT_RANGE_SHIFT; + iatu_entry->iatu_offset = i; + iatu_entry->limit_mask = LIMIT_MASK; + /* dma32 iatu always use max limit mask */ + if (priv->dma32_iatu_used && i == iatu_dma32->iatu_index) + iatu_entry->limit_mask = MAX_LIMIT_MASK; + iatu_entry->pf_id = pf_id; + + if (PRIV_IS_VF(priv)) { + iatu_entry->is_vf = true; + iatu_entry->vf_offset = priv->plat_ex->vf_offset; + } else { + iatu_entry->is_vf = false; + iatu_entry->vf_offset = 0; + } + dev_dbg(priv->device, + "%s, %d, tgt:%#llx, base:%#llx, region:%d, limit:%#llx, pf:%d, is_vf:%d, vf id:%d.\n", + __func__, __LINE__, iatu_entry->tgt_addr, + iatu_entry->base_addr, iatu_entry->iatu_offset, + iatu_entry->limit_mask, iatu_entry->pf_id, + iatu_entry->is_vf, iatu_entry->vf_offset); + + dn200_iatu_tbl_entry_write(priv->ioaddr + DN200_IATU_BAR_OFFSET, + iatu_entry, i, true); + } + + /* update dma32 target address that will not be changed */ + if (priv->dma32_iatu_used) + dn200_iatu_tgt_addr_updt(priv->ioaddr + DN200_IATU_BAR_OFFSET, + iatu_dma32->iatu_index, + iatu_dma32->target_addr); + return 0; +} + +static void dn200_reclaim_queue_iatu(struct dn200_queue_iatu_info *queue_info) +{ + int i = 0; + struct list_head *entry, *tmp; + + /*clean cached target to avoid reuse after freed */ + queue_info->cached.cached_target = 0; + for (; i < 
DN200_MAX_IATU_MAP_PER_QUEUE; i++) { + struct iatu_map_info *info = &queue_info->map_info[i]; + + list_for_each_safe(entry, tmp, &info->list) { + struct iatu_con_hashmap *con_info = + container_of(entry, struct iatu_con_hashmap, node); + + /* rx iatu exist forever and share with tx, no need to update and remove */ + if (con_info && !con_info->is_rx + && !atomic_read(&con_info->ref_count)) { + atomic_sub(1, con_info->global_ref_ptr); + list_del(&con_info->node); + kfree(con_info); + } + } + } +} + +static void dn200_tx_iatu_reclaim(struct dn200_priv *priv, u32 queue_index) +{ + int i = 0; + /* recycle iatu from all tx queues */ + for (; i < priv->plat->tx_queues_to_use; i++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[i]; + + if (queue_index != i) { + if (likely(atomic_wait_trysch(&tx_q->tx_scheduling))) { + dn200_reclaim_queue_iatu(&tx_q->iatu_info); + atomic_free_sch(&tx_q->tx_scheduling); + } + } else { + dn200_reclaim_queue_iatu(&tx_q->iatu_info); + } + } +} + +static inline int dn200_tx_iatu_alloc(struct dn200_priv *priv, + u64 tar_addr, u32 queue_index, + int *tx_map_index) +{ + struct dn200_priv_iatu_map *info; + int i = 0; + int unused_index = -1; + bool need_update = false; + bool retryed = false; + u64 hw_tgt_addr; + + /*alloc iatu region from tx region */ + if (!spin_trylock_bh(&priv->iatu_info.tx_lock)) + return unused_index; + +retry: + for (i = 0; i < priv->iatu_info.basic_info.max_tx_iatu_num; i++) { + info = &priv->iatu_info.tx_info[i]; + if (info->target_addr == tar_addr) { + atomic_inc(&info->global_ref); + need_update = false; + unused_index = info->iatu_index; + *tx_map_index = i; + goto unlock; + } else if (!atomic_read(&info->global_ref) && !need_update) { + need_update = true; + unused_index = info->iatu_index; + *tx_map_index = i; + } + } + if (need_update) { + info = &priv->iatu_info.tx_info[*tx_map_index]; + info->target_addr = tar_addr; + atomic_inc(&info->global_ref); + hw_tgt_addr = tar_addr << CMP_ADDR_SHIFT; + 
priv->swc.tx_iatu_updt_cnt++; + dn200_iatu_tgt_addr_updt(priv->ioaddr + DN200_IATU_BAR_OFFSET, + unused_index, hw_tgt_addr); + } else if (unused_index < 0 && !retryed) { + /*no avalid iatu map, try reclaim iatu resources from all tx_queues */ + dn200_tx_iatu_reclaim(priv, queue_index); + priv->swc.tx_iatu_recyc_cnt++; + retryed = true; + goto retry; + } +unlock: + spin_unlock_bh(&priv->iatu_info.tx_lock); + return unused_index; +} + +static inline int dn200_rx_iatu_match(struct dn200_priv *priv, + u64 tar_addr, int *rx_map_index) +{ + struct dn200_priv_iatu_map *info; + int i = 0; + int unused_index = -1; + bool need_update = false; + u64 hw_tgt_addr; + + for (i = 0; i < priv->iatu_info.basic_info.max_rx_iatu_num; i++) { + info = &priv->iatu_info.rx_info[i]; + if (info->target_addr == tar_addr) { + need_update = false; + unused_index = info->iatu_index; + *rx_map_index = i; + goto unlock; + } else if (!info->target_addr && !need_update) { + need_update = true; + unused_index = info->iatu_index; + *rx_map_index = i; + } + } + if (need_update) { + info = &priv->iatu_info.rx_info[*rx_map_index]; + info->target_addr = tar_addr; + info->is_rx = true; + hw_tgt_addr = tar_addr << CMP_ADDR_SHIFT; + dev_dbg(priv->device, + "%s %d update tgt addr:%#llx, glob ref:%d, unused_index:%d, info:%p, ref:%p\n", + __func__, __LINE__, hw_tgt_addr, + atomic_read(&info->global_ref), unused_index, &info, + &info->global_ref); + dn200_iatu_tgt_addr_updt(priv->ioaddr + DN200_IATU_BAR_OFFSET, + unused_index, hw_tgt_addr); + } +unlock: + dev_dbg(priv->device, "%s, %d, tar_addr:%#llx, unused_index:%d.\n", + __func__, __LINE__, tar_addr, unused_index); + return unused_index; +} + +static inline bool dn200_tx_iatu_match(struct dn200_tx_queue *tx_q, + u64 tar_addr, atomic_t **iatu_ref_ptr, + u64 *base_addr) +{ + u32 hash; + struct list_head *entry, *tmp; + struct iatu_map_info *hash_map; + struct dn200_queue_iatu_info *queue_info = &tx_q->iatu_info; + struct dn200_func_iatu_basic_info *basic_info 
= + &tx_q->priv_data->iatu_info.basic_info; + + hash = hash_32((u32) tar_addr, DN200_MAX_IATU_MAP_SHIFT); + hash_map = &queue_info->map_info[hash]; + + atomic_wait_sch(&tx_q->tx_scheduling); + list_for_each_safe(entry, tmp, &hash_map->list) { + struct iatu_con_hashmap *con_info = + container_of(entry, struct iatu_con_hashmap, node); + + if (con_info && con_info->target_addr == tar_addr) { + if (!con_info->is_rx) { + atomic_inc(&con_info->ref_count); + *iatu_ref_ptr = &con_info->ref_count; + queue_info->cached.ref_count_ptr = + &con_info->ref_count; + } + + *base_addr = + basic_info->tbl[con_info->iatu_index].base_addr; + atomic_free_sch(&tx_q->tx_scheduling); + return true; + } + } + atomic_free_sch(&tx_q->tx_scheduling); + + return false; +} + +static inline bool dn200_tx_iatu_local_reuse(struct dn200_priv *priv, + struct dn200_queue_iatu_info + *queue_info, u64 tar_addr, + atomic_t **iatu_ref_ptr) +{ + int i = 0; + struct list_head *entry, *tmp; + bool find = false; + struct dn200_priv_iatu_map *global_iatu; + + /*clean cached target to avoid reuse after freed */ + queue_info->cached.cached_target = 0; + /*operate global iatu region, lock global */ + + if (!spin_trylock_bh(&priv->iatu_info.tx_lock)) + return false; + + for (; i < DN200_MAX_IATU_MAP_PER_QUEUE; i++) { + struct iatu_map_info *info = &queue_info->map_info[i]; + + list_for_each_safe(entry, tmp, &info->list) { + struct iatu_con_hashmap *con_info = + container_of(entry, struct iatu_con_hashmap, node); + + /*find a iatu map which only used by self */ + if (con_info && !atomic_read(&con_info->ref_count) + && (atomic_read(con_info->global_ref_ptr) == 1)) { + global_iatu = + container_of(con_info->global_ref_ptr, + struct dn200_priv_iatu_map, + global_ref); + + *iatu_ref_ptr = &con_info->ref_count; + atomic_inc(&con_info->ref_count); + /*update iatu map's target addr */ + con_info->target_addr = tar_addr; + global_iatu->target_addr = tar_addr; + dn200_iatu_tgt_addr_updt(priv->ioaddr + DN200_IATU_BAR_OFFSET, 
+ con_info->iatu_index, + tar_addr); + find = true; + goto unlock; + } + } + } +unlock: + spin_unlock_bh(&priv->iatu_info.tx_lock); + return find; +} + +int dn200_tx_iatu_find(u64 tar_addr, struct dn200_tx_queue *tx_q, + atomic_t **iatu_ref_ptr, u64 *base_addr) +{ + int res = 0, ret = 0, tx_map_idx = 0; + struct iatu_con_hashmap *con_info = NULL; + struct iatu_map_info *info = NULL; + struct dn200_priv *priv = tx_q->priv_data; + struct dn200_func_iatu_basic_info *basic_info = + &priv->iatu_info.basic_info; + + priv->swc.tx_iatu_find_cnt++; + *iatu_ref_ptr = NULL; + + /*all high bits are zero, use default iatu region */ + if (tar_addr >> MAX_LIMIT_RANGE_SHIFT == 0) { + *base_addr = tar_addr; + if (tx_q->priv_data->dma32_iatu_used) { + u16 dma32_iatu_idx = + tx_q->priv_data->iatu_info.dma32_info.iatu_index; + *base_addr = basic_info->tbl[dma32_iatu_idx].base_addr; + } + return 0; + } + + tar_addr = (tar_addr >> CMP_ADDR_SHIFT); + if (dn200_tx_iatu_match(tx_q, tar_addr, iatu_ref_ptr, base_addr)) { + priv->swc.tx_iatu_match_cnt++; + return 0; + } + + atomic_wait_sch(&tx_q->tx_scheduling); + con_info = kzalloc(sizeof(struct iatu_con_hashmap), GFP_ATOMIC); + if (unlikely(!con_info)) { + dev_err(tx_q->priv_data->device, "%s %d alloc failure.\n", + __func__, __LINE__); + res = -ENOMEM; + goto free_sch; + } + ret = + dn200_tx_iatu_alloc(tx_q->priv_data, tar_addr, tx_q->queue_index, + &tx_map_idx); + if (ret < 0) { + kfree(con_info); + res = -ENOSPC; + goto free_sch; + } + info = + &tx_q->iatu_info.map_info[hash_32((u32) tar_addr, DN200_MAX_IATU_MAP_SHIFT)]; + con_info->iatu_index = ret; + con_info->target_addr = tar_addr; + atomic_set(&con_info->ref_count, 0); + atomic_inc(&con_info->ref_count); + *iatu_ref_ptr = &con_info->ref_count; + con_info->global_ref_ptr = + &tx_q->priv_data->iatu_info.tx_info[tx_map_idx].global_ref; + /*barrier to protect gloval_ref_ptr's value available*/ + smp_mb(); + list_add_tail(&con_info->node, &info->list); + *base_addr = 
basic_info->tbl[ret].base_addr; +free_sch: + atomic_free_sch(&tx_q->tx_scheduling); + return res; +} + +int dn200_rx_iatu_find(u64 tar_addr, struct dn200_priv *priv, u64 *base_addr) +{ + int res = 0, ret = 0, rx_map_idx = 0; + u64 dma_addr = 0; + struct dn200_func_iatu_basic_info *basic_info = + &priv->iatu_info.basic_info; + + if (dma_can_direct_use(priv, tar_addr)) { + *base_addr = tar_addr; + return 0; + } + /*all high bits are zero, use default iatu region */ + if ((tar_addr >> MAX_LIMIT_RANGE_SHIFT) == 0) { + *base_addr = tar_addr; + if (priv->dma32_iatu_used) { + u16 dma32_iatu_idx = + priv->iatu_info.dma32_info.iatu_index; + + *base_addr = basic_info->tbl[dma32_iatu_idx].base_addr | (tar_addr & ((u64) BIT(32) - 1)); + } + return 0; + } + dma_addr = (tar_addr >> CMP_ADDR_SHIFT); + + ret = dn200_rx_iatu_match(priv, dma_addr, &rx_map_idx); + if (ret < 0) { + dev_dbg(priv->device, + "%s %d dma_addr %#llx alloc iatu failed\n", __func__, + __LINE__, dma_addr); + res = -ENOSPC; + goto free_sch; + } + *base_addr = basic_info->tbl[ret].base_addr | (tar_addr & LIMIT_MASK); + pr_debug("%s, %d, tar_addr %#llx base addr:%#llx\n", __func__, __LINE__, + tar_addr, *base_addr); + +free_sch: + return res; +} + +static void dn200_display_queue_iatu(struct dn200_priv *priv, + struct dn200_queue_iatu_info *queue_info, + struct seq_file *seq) +{ + int i = 0; + struct list_head *entry, *tmp; + + for (; i < DN200_MAX_IATU_MAP_PER_QUEUE; i++) { + struct iatu_map_info *info = &queue_info->map_info[i]; + + list_for_each_safe(entry, tmp, &info->list) { + struct iatu_con_hashmap *con_info = + container_of(entry, struct iatu_con_hashmap, node); + + if (!con_info->is_rx) { + seq_printf(seq, + " index %d, iatu index:%d, tar_addr %llx, local ref_cnt %d, global ref %d\n", + i, con_info->iatu_index, + con_info->target_addr, + atomic_read(&con_info->ref_count), + atomic_read(con_info->global_ref_ptr)); + } + } + } +} + +void dn200_iatu_display(struct dn200_priv *priv, struct seq_file *seq) +{ 
+ struct dn200_priv_iatu_map *info; + int i = 0; + u16 total_iatu = priv->iatu_info.basic_info.max_tx_iatu_num + + priv->iatu_info.basic_info.max_rx_iatu_num; + + if (priv->dma32_iatu_used) + total_iatu++; + + seq_printf(seq, "====== fun:%d, total iatu:%d ======\n", + priv->plat_ex->funcid, total_iatu); + + seq_printf(seq, "=== tx iatu total:%d ===\n", + priv->iatu_info.basic_info.max_tx_iatu_num); + for (i = 0; i < priv->iatu_info.basic_info.max_tx_iatu_num; i++) { + info = &priv->iatu_info.tx_info[i]; + seq_printf(seq, + "index %d, iatu index:%d, target_addr %llx, global_ref %d, from rx:%s\n", + i, info->iatu_index, info->target_addr, + atomic_read(&info->global_ref), + info->is_rx ? "yes" : "no"); + } + + seq_printf(seq, "=== rx iatu total:%d ===\n", + priv->iatu_info.basic_info.max_rx_iatu_num); + for (i = 0; i < priv->iatu_info.basic_info.max_rx_iatu_num; i++) { + info = &priv->iatu_info.rx_info[i]; + seq_printf(seq, "index %d, iatu index:%d, target_addr %llx\n", + i, info->iatu_index, info->target_addr); + } + + seq_printf(seq, "=== dma32 iatu used:%d, iatu index:%d ===\n", + priv->dma32_iatu_used, + priv->iatu_info.dma32_info.iatu_index); + + seq_puts(seq, "=== tx iatu per queue: ===\n"); + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[i]; + + if (likely(atomic_wait_trysch(&tx_q->tx_scheduling))) { + seq_printf(seq, "%s queue index %d :\n", __func__, i); + dn200_display_queue_iatu(priv, &tx_q->iatu_info, seq); + atomic_free_sch(&tx_q->tx_scheduling); + } + } +} + +static int dn200_add_rx_iatu2tx_queue(struct dn200_priv *priv, u8 queue_id, + u8 *rx_num) +{ + int i = 0; + struct dn200_priv_iatu_map *info; + struct iatu_map_info *list_info; + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue_id]; + + for (i = 0; i < priv->iatu_info.basic_info.max_rx_iatu_num; i++) { + info = &priv->iatu_info.rx_info[i]; + if (info->target_addr) { + struct iatu_con_hashmap *con_info = + kzalloc(sizeof(struct 
iatu_con_hashmap), + GFP_ATOMIC); + + if (unlikely(!con_info)) { + dev_err(priv->device, "%s %d alloc failure.\n", + __func__, __LINE__); + return 0; + } + + list_info = + &tx_q->iatu_info.map_info[hash_32((u32) info->target_addr, DN200_MAX_IATU_MAP_SHIFT)]; + con_info->iatu_index = info->iatu_index; + con_info->target_addr = info->target_addr; + con_info->is_rx = true; + dev_dbg(priv->device, + "%s %d iatu index %d target_addr %#llx queue_id %d\n", + __func__, __LINE__, info->iatu_index, + info->target_addr, queue_id); + /*barrier to protect con_info value available*/ + smp_mb(); + list_add_tail(&con_info->node, &list_info->list); + } else { + *rx_num = i; + break; + } + } + return 0; +} + +int dn200_add_rx_iatu2tx(struct dn200_priv *priv) +{ + int i = 0; + u8 rx_num = 0; + u8 unused_num = 0; + struct dn200_priv_iatu_info *iatu_info = &priv->iatu_info; + struct dn200_priv_iatu_map *tx_iatu_map; + struct dn200_priv_iatu_map *rx_iatu_map; + int max_rx_iatu_num = priv->iatu_info.basic_info.max_rx_iatu_num; + int max_tx_iatu_num = priv->iatu_info.basic_info.max_tx_iatu_num; + + rx_num = max_rx_iatu_num; + for (; i < priv->plat->tx_queues_to_use; i++) { + if (dn200_add_rx_iatu2tx_queue(priv, i, &rx_num)) + return 0; + } + unused_num = max_rx_iatu_num - rx_num; + if (unused_num) { + dev_dbg(priv->device, "%s %d rx use %d put unused %d to tx\n", + __func__, __LINE__, rx_num, unused_num); + i = 0; + for (; i < unused_num; i++) { + tx_iatu_map = &iatu_info->tx_info[max_tx_iatu_num + i]; + rx_iatu_map = &iatu_info->rx_info[rx_num + i]; + tx_iatu_map->iatu_index = rx_iatu_map->iatu_index; + tx_iatu_map->is_rx = true; + dev_dbg(priv->device, + "%s %d addr rx index %d iatu_index %d to tx_index %d\n", + __func__, __LINE__, rx_num + i, + rx_iatu_map->iatu_index, max_tx_iatu_num + i); + } + + priv->iatu_info.basic_info.max_tx_iatu_num += unused_num; + priv->iatu_info.basic_info.max_rx_iatu_num -= unused_num; + } + + return 0; +} + +int dn200_iatu_init(struct dn200_priv *priv) +{ 
+ int i = 0, j = 0; + u8 max_iatu = 0; + u8 dma32_iatu_base = 0, rx_iatu_base = 0, tx_iatu_base = 0; + struct dn200_priv_iatu_info *iatu_info = &priv->iatu_info; + struct dn200_func_iatu_basic_info *basic_info = &iatu_info->basic_info; + struct dn200_priv_iatu_map *iatu_map; /*global iatu map info per function */ + struct dn200_queue_iatu_info *queue_iatu; /*iatu map info per queue */ + u8 max_rx_iatu, max_tx_iatu; + + basic_info->max_tx_iatu_num = 0; + basic_info->max_rx_iatu_num = 0; + memset(iatu_info, 0, sizeof(struct dn200_priv_iatu_info)); + spin_lock_init(&iatu_info->rx_lock); + spin_lock_init(&iatu_info->tx_lock); + + if (dn200_iatu_cfg_get(priv, &tx_iatu_base, &max_iatu) < 0) + return -EINVAL; + + max_tx_iatu = (max_iatu >> 1); + max_tx_iatu = min_t(u8, max_tx_iatu, (u8) DN200_MAX_IATU_MAP_TX); + max_rx_iatu = (max_iatu - max_tx_iatu - 1); + max_rx_iatu = min_t(u8, max_rx_iatu, (u8) DN200_MAX_IATU_MAP_RX); + + /* if pf support raid or it is vf, should set dma32 iatu + * to solve address confict between raid and pf/vf + * the iatu is also used for vf outband address routing + */ + if (priv->plat_ex->raid_supported || PRIV_IS_VF(priv)) + priv->dma32_iatu_used = true; + + /* for pf without raid, no need to reserve dma32 iatu, just use it for tx */ + if (!priv->dma32_iatu_used) + max_tx_iatu += 1; + + basic_info->max_tx_iatu_num = max_tx_iatu; + basic_info->max_rx_iatu_num = max_rx_iatu; + /* tx_iatu_base ~ (tx_iatu_base + max_tx_iatu - 1) used by tx queues */ + for (i = 0; i < max_tx_iatu; i++) { + iatu_map = &iatu_info->tx_info[i]; + iatu_map->iatu_index = i + tx_iatu_base; + iatu_map->target_addr = 0; + atomic_set(&iatu_map->global_ref, 0); + dev_dbg(priv->device, + "%s, %d, tx i:%d, iatu_index:%d, max_tx_iatu:%d, tx_iatu_base:%d!\n", + __func__, __LINE__, i, iatu_map->iatu_index, + max_tx_iatu, tx_iatu_base); + } + + /* iatu index (rx_iatu_base) ~ (rx_iatu_base + rx_iatu_base - 1) used by rx queues */ + rx_iatu_base = tx_iatu_base + max_tx_iatu; + for (i = 
0; i < max_rx_iatu; i++) { + iatu_map = &iatu_info->rx_info[i]; + iatu_map->iatu_index = i + rx_iatu_base; + iatu_map->target_addr = 0; + atomic_set(&iatu_map->global_ref, 0); + dev_dbg(priv->device, + "%s, %d, rx i:%d, iatu_index:%d, max_rx_iatu:%d, rx_iatu_base:%d!\n", + __func__, __LINE__, i, iatu_map->iatu_index, + max_rx_iatu, rx_iatu_base); + } + + /* last iatu index used by default dma32 in two conditions: for vf or pf support raid */ + if (priv->dma32_iatu_used) { + dma32_iatu_base = rx_iatu_base + max_rx_iatu; + iatu_info->dma32_info.iatu_index = dma32_iatu_base; + iatu_info->dma32_info.target_addr = 0; + dev_dbg(priv->device, + "%s, %d, dma32_iatu_base or iatu_index:%d!\n", __func__, + __LINE__, iatu_info->dma32_info.iatu_index); + } + + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + queue_iatu = &priv->tx_queue[i].iatu_info; + for (j = 0; j < DN200_MAX_IATU_MAP_PER_QUEUE; j++) { + struct iatu_map_info *map_info = + &queue_iatu->map_info[j]; + + INIT_LIST_HEAD(&map_info->list); + } + } + + for (i = 0; i < priv->plat->rx_queues_to_use; i++) { + queue_iatu = &priv->rx_queue[i].iatu_info; + for (j = 0; j < DN200_MAX_IATU_MAP_PER_QUEUE; j++) { + struct iatu_map_info *map_info = + &queue_iatu->map_info[j]; + + INIT_LIST_HEAD(&map_info->list); + } + } + + if (dn200_iatu_tbl_init(priv) < 0) + return -EINVAL; + + set_bit(DN200_IATU_INIT, &priv->state); + return 0; +} + +static void dn200_queue_iatu_free(struct dn200_priv *priv, + struct dn200_queue_iatu_info *queue_info) +{ + int i = 0; + struct list_head *entry, *tmp; + + for (; i < DN200_MAX_IATU_MAP_PER_QUEUE; i++) { + struct iatu_map_info *info = &queue_info->map_info[i]; + + if (list_empty(&info->list) || !info->list.next) + continue; + list_for_each_safe(entry, tmp, &info->list) { + struct iatu_con_hashmap *con_info = + container_of(entry, struct iatu_con_hashmap, node); + if (con_info) { + list_del(&con_info->node); + kfree(con_info); + } + } + INIT_LIST_HEAD(&info->list); + } + 
memset(queue_info->map_info, 0, sizeof(queue_info->map_info)); +} + +static int dn200_iatu_tbl_clear(struct dn200_priv *priv) +{ + int i; + u8 pf_id = priv->plat_ex->pf_id; + u8 max_iatu = 0; + struct dn200_iatu_tbl_entry iatu_entry; + u8 iatu_start_idx = 0; + + if (dn200_iatu_cfg_get(priv, &iatu_start_idx, &max_iatu) < 0) + return -EINVAL; + + for (i = iatu_start_idx; i < iatu_start_idx + max_iatu; i++) { + if (i >= DN200_MAX_IATU_TBL_SIZE) { + dev_err(priv->device, + "%s, %d, iatu index:%d exceed max iatu tbl size:%d, iatu start:%d, max:%d.\n", + __func__, __LINE__, i, DN200_MAX_IATU_TBL_SIZE, + iatu_start_idx, max_iatu); + break; + } + + iatu_entry.tgt_addr = 0; + iatu_entry.base_addr = 0; + iatu_entry.iatu_offset = i; + iatu_entry.limit_mask = 0; + iatu_entry.pf_id = pf_id; + iatu_entry.is_vf = 0; + iatu_entry.vf_offset = 0; + + dn200_iatu_tbl_entry_write(priv->ioaddr + DN200_IATU_BAR_OFFSET, + &iatu_entry, i, false); + } + + return 0; +} + +void dn200_iatu_uninit(struct dn200_priv *priv) +{ + int i = 0; + u8 tx_iatu_base, max_iatu; + + if (dn200_iatu_cfg_get(priv, &tx_iatu_base, &max_iatu) < 0) + return; + + for (; i < priv->plat->tx_queues_to_use; i++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[i]; + + atomic_wait_sch(&tx_q->tx_scheduling); + dn200_queue_iatu_free(priv, &tx_q->iatu_info); + atomic_free_sch(&tx_q->tx_scheduling); + } + dn200_iatu_tbl_clear(priv); + clear_bit(DN200_IATU_INIT, &priv->state); +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_iatu.h b/drivers/net/ethernet/dapustor/dn200/dn200_iatu.h new file mode 100644 index 000000000000..b2a207121aad --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_iatu.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#ifndef __DN200_IATU_H__ +#define __DN200_IATU_H__ + +#include "common.h" + +#define DN200_MAX_IATU_TBL_SIZE_PER_FUNC 16 +#define DN200_MAX_IATU_MAP_SHIFT 4 +#define DN200_MAX_IATU_TBL_SIZE 32 +#define DN200_MAX_IATU_MAP_PER_QUEUE BIT(DN200_MAX_IATU_MAP_SHIFT) +#define DN200_MAX_IATU_MAP_RX 7 +#define DN200_MAX_IATU_MAP_TX BIT(DN200_MAX_IATU_MAP_SHIFT - 1) +#define DN200_BASE_IATU_ADDR 0x00E000000000ULL +#define DN200_RAID_BASE_IATU_ADDR 0x80E000000000ULL +#define DN200_AXI_HIGH_ADDR_REG_BASE 0x20000344 +#define DN200_AXI_HIGH_24BIT_VAL 0x000080 + +/*record per iatu mapped info for a tx_q*/ +struct iatu_con_hashmap { + struct list_head node; + u16 iatu_index; + u64 target_addr; + atomic_t ref_count; + atomic_t *global_ref_ptr; + bool is_rx; +}; + +struct iatu_map_info { + struct list_head list; +}; + +struct iatu_cached_target { + u64 cached_target; + u16 iatu_index; + atomic_t *ref_count_ptr; + bool is_rx; +}; + +struct dn200_queue_iatu_info { + struct iatu_map_info map_info[DN200_MAX_IATU_MAP_PER_QUEUE]; + struct iatu_cached_target cached; +}; + +struct dn200_iatu_tbl_entry { + u64 tgt_addr; /* target address used as high address */ + u64 base_addr; /* base address used to compare */ + u16 iatu_offset; /*iatu global offset */ + u64 limit_range; /*limit_range maximum BIT(32) */ + u64 limit_mask; /*limit_mask maximum BIT(32) - 1 */ + u8 pf_id; + u8 is_vf; + u8 vf_offset; +}; + +struct dn200_func_iatu_basic_info { + u16 max_tx_iatu_num; + u16 max_rx_iatu_num; + struct dn200_iatu_tbl_entry tbl[DN200_MAX_IATU_TBL_SIZE]; +}; + +struct dn200_priv_iatu_map { + u16 iatu_index; + u64 target_addr; + atomic_t global_ref; + bool is_rx; +}; + +struct dn200_priv_iatu_info { + spinlock_t rx_lock; /* rx iatu lock */ + spinlock_t tx_lock; /* tx iatu lock */ + struct dn200_priv_iatu_map dma32_info; + struct dn200_priv_iatu_map rx_info[DN200_MAX_IATU_MAP_RX]; + struct dn200_priv_iatu_map tx_info[DN200_MAX_IATU_MAP_TX + + DN200_MAX_IATU_MAP_RX]; + struct 
dn200_func_iatu_basic_info basic_info; +}; + +#define atomic_wait_sch(v) \ +do {} while (atomic_cmpxchg(v, 0, 1)) + +#define atomic_wait_trysch(v) !atomic_cmpxchg(v, 0, 1) +#define atomic_free_sch(v) atomic_set(v, 0) + +enum dn200_iatu_type { + IATU_TX, + IATU_RX, + IATU_DMA32, +}; + +#define DN200_IATU_BASE_ADDR_SET(addr, index) +#define DN200_IATU_TAR_ADDR_SET(addr, index) +#define MAX_LIMIT_RANGE_SHIFT 32 +#define MAX_LIMIT_RANGE_SIZE BIT(MAX_LIMIT_RANGE_SHIFT) +#define MAX_LIMIT_MASK (MAX_LIMIT_RANGE_SIZE - 1) + +/* e.g. change LIMIT_RANGE_SHIFT to 26 bit, each iatu just cover 64MB */ +#define LIMIT_RANGE_SHIFT 32 //MAX_LIMIT_RANGE_SHIFT +#define LIMIT_RANGE_SIZE BIT(LIMIT_RANGE_SHIFT) +#define LIMIT_MASK (LIMIT_RANGE_SIZE - 1) +#define CMP_ADDR_SHIFT (LIMIT_RANGE_SHIFT) + +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_main.c b/drivers/net/ethernet/dapustor/dn200/dn200_main.c new file mode 100644 index 000000000000..6da94878882e --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_main.c @@ -0,0 +1,10515 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + *This is the driver for the Dapustor DN200 10/100/1000/10G Ethernet controllers. + * + * Copyright (c) 2024 DapuStor Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dn200_ptp.h" +#include "dn200.h" +#include +#include +#include + +#include "dwxgmac_comm.h" +#include "hwif.h" +#include "dn200_phy.h" +#include "dn200_sriov.h" +#include "dn200_cfg.h" +#include "dn200_dcb.h" +#include "dn200_pool.h" +#include "dn200_eprom.h" + +/* As long as the interface is active, we keep the timestamping counter enabled + * with fine resolution and binary rollover. 
This avoids non-monotonic behavior
net_device_ops dn200_vf_netdev_ops; +static void dn200_init_fs(struct net_device *dev); +static void dn200_exit_fs(struct net_device *dev); + +static void dn200_napi_add(struct net_device *dev); +static void dn200_napi_del(struct net_device *dev); +static void dn200_vxlan_set(struct net_device *netdev) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u32 value; + + writel(priv->vxlan_port, priv->ioaddr + XGMAC_TUNNEL_IDENTIFIER); + value = readl(priv->ioaddr + XGMAC_PACKET_FILTER); + value |= XGMAC_FILTER_VUCC; + writel(value, priv->ioaddr + XGMAC_PACKET_FILTER); + + value = readl(priv->ioaddr + XGMAC_TX_CONFIG); + value &= ~XGMAC_CONFIG_VNM; // VXLAN + value |= XGMAC_CONFIG_VNE; + writel(value, priv->ioaddr + XGMAC_TX_CONFIG); +} + +static int dn200_vxlan_unset(struct net_device *netdev) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u32 value; + + value = readl(priv->ioaddr + XGMAC_PACKET_FILTER); + value &= ~XGMAC_FILTER_VUCC; + writel(value, priv->ioaddr + XGMAC_PACKET_FILTER); + + value = readl(priv->ioaddr + XGMAC_TX_CONFIG); + value &= ~(XGMAC_CONFIG_VNM | XGMAC_CONFIG_VNE); + writel(value, priv->ioaddr + XGMAC_TX_CONFIG); + priv->vxlan_port = 0; + writel(0, priv->ioaddr + XGMAC_TUNNEL_IDENTIFIER); + return 0; +} + +#define DN200_COAL_TIMER(x) (ns_to_ktime((x) * NSEC_PER_USEC)) +#define DN200_POLL_TIMER(x) (ns_to_ktime((x) * NSEC_PER_MSEC)) + + +static int dn200_vxlan_set_port(struct net_device *netdev, unsigned int table, + unsigned int entry, struct udp_tunnel_info *ti) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u16 status = 0, flag; + + priv->vxlan_port = be16_to_cpu(ti->port); + if (!HW_IS_PUREPF(priv->hw)) { + status = dn200_get_vxlan_status(priv->hw); + if (!status) + dn200_vxlan_set(netdev); + flag = PRIV_IS_VF(priv) ? 
(DN200_VF_OFFSET_GET(priv->hw) + 1) : + 0; + status = status | (1 << flag); + dn200_set_vxlan_status(priv->hw, status); + } else { + dn200_vxlan_set(netdev); + } + return 0; +} + +static int dn200_vxlan_unset_port(struct net_device *netdev, unsigned int table, + unsigned int entry, + struct udp_tunnel_info *ti) +{ + struct dn200_priv *priv = netdev_priv(netdev); + u16 status = 0, flag; + + priv->vxlan_port = 0; + if (!PRIV_IS_VF(priv)) + dn200_vxlan_unset(netdev); + if (!HW_IS_PUREPF(priv->hw)) { + status = dn200_get_vxlan_status(priv->hw); + flag = PRIV_IS_VF(priv) ? (DN200_VF_OFFSET_GET(priv->hw) + 1) : + 0; + status = status & (~(1 << flag)); + if (!status) + dn200_vxlan_unset(netdev); + dn200_set_vxlan_status(priv->hw, status); + } else { + dn200_vxlan_unset(netdev); + } + return 0; +} + +const struct udp_tunnel_nic_info dn200_udp_tunnels = { + .set_port = dn200_vxlan_set_port, + .unset_port = dn200_vxlan_unset_port, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .tables = { + { + .n_entries = 1, + .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, + }, + }, +}; + + +static void dn200_init_ndev_tunnel(struct net_device *ndev) +{ + netdev_features_t tso_features; + + tso_features = NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_PARTIAL; + + ndev->hw_features |= tso_features; + ndev->features |= tso_features; + ndev->hw_enc_features |= ndev->features; + ndev->vlan_features |= tso_features; + /*support gso partial features to calc tunnel out udp csum in kernel */ + ndev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; + ndev->udp_tunnel_nic_info = &dn200_udp_tunnels; +} + +static void __dn200_disable_all_queues(struct dn200_priv *priv) +{ + u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); + u32 queue; + + for (queue = 0; queue < maxq; queue++) { + struct dn200_channel *ch = &priv->channel[queue]; + + if (queue < rx_queues_cnt && queue < tx_queues_cnt && 
+ priv->txrx_itr_combined) { + napi_disable(&ch->agg_napi); + } else { + if (queue < rx_queues_cnt) + napi_disable(&ch->rx_napi); + if (queue < tx_queues_cnt) + napi_disable(&ch->tx_napi); + } + } +} + +void dn200_normal_reset(struct dn200_priv *priv) +{ + dn200_global_err(priv, DN200_NORMAL_RESET); +} + +void dn200_fw_err_dev_close(struct dn200_priv *priv) +{ + if (!PRIV_IS_VF(priv)) + DN200_SET_LRAM_MAILBOX_MEMBER(priv->hw, pf_fw_err_states, 1); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + dn200_global_err(priv, POLL_FW_CQ_TIMEOUT); +} + +/* do hw reset and netdev rlease and open + * for pf: notify all vfs to release and open + * for vf: notify pf to do global err reset (pf will notify all vfs) + */ +void dn200_global_err(struct dn200_priv *priv, enum dn200_err_rst_type err_type) +{ + u8 states = 0; + + netif_carrier_off(priv->dev); + if (err_type == DN200_TX_TIMEOUT) + priv->xstats.tx_timeout_rst_count++; + else if (err_type == DN200_DMA_CHAN_ERR) + priv->xstats.dma_chan_err_rst_count++; + + if (err_type != DN200_NORMAL_RESET && !test_bit(DN200_DEV_ERR_CLOSE, &priv->state)) + netdev_info(priv->dev, "%s, %d, global reset type:%d.\n", __func__, + __LINE__, err_type); + + if (err_type == POLL_FW_CQ_TIMEOUT || err_type == DN200_PCIE_UNAVAILD_ERR || + err_type == DN200_PHY_MPLLA_UNLOCK) { + if (!test_bit(DN200_DOWN, &priv->state) && + !test_and_set_bit(DN200_DEV_ERR_CLOSE, &priv->state)) { + /* schedule task to do pf sw & hw reset, + * and notify vf to stop & open netdev + */ + queue_work(priv->wq, &priv->service_task); + return; + } + } + + /* for critical err, notify pf to do reset */ + if (PRIV_IS_VF(priv) && err_type != DN200_NORMAL_RESET) { + dev_dbg(priv->device, + "%s, %d, vf send err rst request to pf.\n", __func__, + __LINE__); + /* if PF is down, no need to notify pf do reset */ + DN200_GET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, &states); + if (!states) + return; + + /* within vf driver, global_err will be called in timer + * 
processing context when tx timeout, + * can't call dn200_vf_glb_err_rst_notify & irq_peer_notify directly, + * as irq_peer_notify will sleep + */ + if (!test_and_set_bit(DN200_VF_NOTIFY_PF_RESET, &priv->state)) + dn200_vf_work(priv); + return; + } + + /* shedule work queue to do DN200_ERR_RESET */ + if (!test_bit(DN200_DOWN, &priv->state) && + !test_and_set_bit(DN200_ERR_RESET, &priv->state)) { + dev_dbg(priv->device, + "%s, %d, schedule task to do pf sw & hw reset, and notify vf to stop & open netdev.\n", + __func__, __LINE__); + /* schedule task to do pf sw & hw reset, and notify vf to stop & open netdev */ + + queue_work(priv->wq, &priv->service_task); + } +} + +void dn200_vf_work(struct dn200_priv *priv) +{ + queue_work(priv->wq, &priv->vf_process_task); +} + +/** + * dn200_clk_csr_set - dynamically set the MDC clock + * @priv: driver private structure + * Description: this is to dynamically set the MDC clock according to the csr + * clock input. + * Note: + * If a specific clk_csr value is passed from the platform + * this means that the CSR Clock Range selection cannot be + * changed at run-time and it is fixed (as reported in the driver + * documentation). Viceversa the driver will try to set the MDC + * clock dynamically according to the actual clock input. + */ +static void dn200_clk_csr_set(struct dn200_priv *priv) +{ + u32 clk_rate; + + clk_rate = clk_get_rate(priv->plat->dn200_clk); + + /* Platform provided default clk_csr would be assumed valid + * for all other cases except for the below mentioned ones. + * For values higher than the IEEE 802.3 specified frequency + * we can not estimate the proper divider as it is not known + * the frequency of clk_csr_i. So we do not change the default + * divider. 
+ */ + if (!(priv->clk_csr & MAC_CSR_H_FRQ_MASK)) { + if (clk_rate < CSR_F_35M) + priv->clk_csr = DN200_CSR_20_35M; + else if ((clk_rate >= CSR_F_35M) && (clk_rate < CSR_F_60M)) + priv->clk_csr = DN200_CSR_35_60M; + else if ((clk_rate >= CSR_F_60M) && (clk_rate < CSR_F_100M)) + priv->clk_csr = DN200_CSR_60_100M; + else if ((clk_rate >= CSR_F_100M) && (clk_rate < CSR_F_150M)) + priv->clk_csr = DN200_CSR_100_150M; + else if ((clk_rate >= CSR_F_150M) && (clk_rate < CSR_F_250M)) + priv->clk_csr = DN200_CSR_150_250M; + else if ((clk_rate >= CSR_F_250M) && (clk_rate <= CSR_F_300M)) + priv->clk_csr = DN200_CSR_250_300M; + } + + if (priv->plat->has_xgmac) { + if (clk_rate > 400000000) + priv->clk_csr = 0x5; + else if (clk_rate > 350000000) + priv->clk_csr = 0x4; + else if (clk_rate > 300000000) + priv->clk_csr = 0x3; + else if (clk_rate > 250000000) + priv->clk_csr = 0x2; + else if (clk_rate > 150000000) + priv->clk_csr = 0x1; + else + priv->clk_csr = 0x0; + } +} + +static void print_pkt(unsigned char *buf, int len) +{ + pr_debug("len = %d byte, buf addr: 0x%p\n", len, buf); + print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, buf, len); +} + +static inline u32 dn200_ring_entries_calc(unsigned int ring_size, + unsigned int start_idx, unsigned int end_idx) +{ + u32 entry_num; + + if (end_idx >= start_idx) + entry_num = end_idx - start_idx + 1; + else + entry_num = ring_size - start_idx + end_idx + 1; + + return entry_num; +} + +static inline u32 dn200_tx_avail(struct dn200_priv *priv, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + u32 avail; + + if (tx_q->dirty_tx > tx_q->cur_tx) + avail = tx_q->dirty_tx - tx_q->cur_tx - 1; + else + avail = priv->dma_tx_size - tx_q->cur_tx + tx_q->dirty_tx - 1; + + return avail; +} + +/** + * dn200_rx_dirty - Get RX queue dirty + * @priv: driver private structure + * @queue: RX queue index + */ +static inline u32 dn200_rx_dirty(struct dn200_priv *priv, u32 queue) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; 
+ u32 dirty = 0; + + if (rx_q->dirty_rx < rx_q->cur_rx) { + dirty = rx_q->cur_rx - rx_q->dirty_rx - 1; + } else if (rx_q->dirty_rx > rx_q->cur_rx) { + /* dirty_rx point to the end of ring at the beginning, + * when curr_rx equal to dirty_rx means the ring is empty + */ + dirty = priv->dma_rx_size - rx_q->dirty_rx + rx_q->cur_rx - 1; + } else { + dev_err(priv->device, + "%s, %d, rx ring %d is abormal, as dirty_rx can't equal cur_rx, drity:%d, cur:%d", + __func__, __LINE__, queue, rx_q->dirty_rx, + rx_q->cur_rx); + dirty = 0; + } + + return dirty; +} + +static void dn200_lpi_entry_timer_config(struct dn200_priv *priv, bool en) +{ + int tx_lpi_timer; + + /* Clear/set the SW EEE timer flag based on LPI ET enablement */ + priv->eee_sw_timer_en = en ? 0 : 1; + tx_lpi_timer = en ? priv->tx_lpi_timer : 0; + dn200_set_eee_lpi_timer(priv, priv->hw, tx_lpi_timer); +} + +/** + * dn200_enable_eee_mode - check and enter in LPI mode + * @priv: driver private structure + * Description: this function is to verify and enter in LPI mode in case of + * EEE. + */ +static int dn200_enable_eee_mode(struct dn200_priv *priv) +{ + u32 tx_cnt = priv->plat->tx_queues_to_use; + u32 queue; + + /* check if all TX queues have the work finished */ + for (queue = 0; queue < tx_cnt; queue++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + + if (tx_q->dirty_tx != tx_q->cur_tx) + return -EBUSY; /* still unfinished work */ + } + + /* Check and enter in LPI mode */ + if (!priv->tx_path_in_lpi_mode) + dn200_set_eee_mode(priv, priv->hw, + priv->plat->en_tx_lpi_clockgating, false); + return 0; +} + +/** + * dn200_disable_eee_mode - disable and exit from LPI mode + * @priv: driver private structure + * Description: this function is to exit and disable EEE in case of + * LPI state is true. This is called by the xmit. 
+ */ +void dn200_disable_eee_mode(struct dn200_priv *priv) +{ + if (!priv->eee_sw_timer_en) { + dn200_lpi_entry_timer_config(priv, 0); + return; + } + + dn200_reset_eee_mode(priv, priv->hw); + del_timer_sync(&priv->eee_ctrl_timer); + priv->tx_path_in_lpi_mode = false; +} + +/** + * dn200_eee_ctrl_timer - EEE TX SW timer. + * @t: timer_list struct containing private info + * Description: + * if there is no data transfer and if we are not in LPI state, + * then MAC Transmitter can be moved to LPI state. + */ +static void dn200_eee_ctrl_timer(struct timer_list *t) +{ + struct dn200_priv *priv = from_timer(priv, t, eee_ctrl_timer); + + if (dn200_enable_eee_mode(priv)) + mod_timer(&priv->eee_ctrl_timer, + DN200_LPI_T(priv->tx_lpi_timer)); +} + +/** + * dn200_eee_init - init EEE + * @priv: driver private structure + * Description: + * if the GMAC supports the EEE (from the HW cap reg) and the phy device + * can also manage EEE, this function enable the LPI state and start related + * timer. + */ +bool dn200_eee_init(struct dn200_priv *priv) +{ + int eee_tw_timer = priv->eee_tw_timer; + + if (PRIV_IS_VF(priv)) + return false; + + /* Using PCS we cannot dial with the phy registers at this stage + * so we do not support extra feature like EEE. + */ + if (priv->hw->pcs == DN200_PCS_TBI || priv->hw->pcs == DN200_PCS_RTBI) + return false; + + /* Check if MAC core supports the EEE feature. 
*/ + if (!priv->dma_cap.eee) + return false; + + mutex_lock(&priv->lock); + + /* Check if it needs to be deactivated */ + if (!priv->eee_active) { + if (priv->eee_enabled) { + netdev_dbg(priv->dev, "disable EEE\n"); + dn200_lpi_entry_timer_config(priv, 0); + del_timer_sync(&priv->eee_ctrl_timer); + dn200_set_eee_timer(priv, priv->hw, 0, eee_tw_timer); + if (priv->plat_ex->has_xpcs) + dn200_xpcs_config_eee(PRIV_PHY_INFO(priv), + priv->plat->mult_fact_100ns, false); + dn200_reset_eee_mode(priv, priv->hw); + } + mutex_unlock(&priv->lock); + return false; + } + + if (priv->eee_active && !priv->eee_enabled) { + timer_setup(&priv->eee_ctrl_timer, dn200_eee_ctrl_timer, 0); + dn200_set_eee_timer(priv, priv->hw, DN200_DEFAULT_LIT_LS, + eee_tw_timer); + if (priv->plat_ex->has_xpcs) + dn200_xpcs_config_eee(PRIV_PHY_INFO(priv), + priv->plat->mult_fact_100ns, + true); + } + + if ((priv->mii || priv->plat->has_gmac4) && priv->tx_lpi_timer <= DN200_ET_MAX) { + del_timer_sync(&priv->eee_ctrl_timer); + priv->tx_path_in_lpi_mode = false; + dn200_lpi_entry_timer_config(priv, 1); + dn200_set_eee_timer(priv, priv->hw, DN200_DEFAULT_LIT_LS, + 0x64); + dn200_set_eee_mode(priv, priv->hw, + priv->plat->en_tx_lpi_clockgating, true); + } else { + dn200_lpi_entry_timer_config(priv, 0); + mod_timer(&priv->eee_ctrl_timer, + DN200_LPI_T(priv->tx_lpi_timer)); + } + + mutex_unlock(&priv->lock); + netdev_dbg(priv->dev, "Energy-Efficient Ethernet initialized\n"); + return true; +} + +/* dn200_get_tx_hwtstamp - get HW TX timestamps + * @priv: driver private structure + * @p : descriptor pointer + * @skb : the socket buffer + * Description : + * This function will read timestamp from the descriptor & pass it to stack. + * and also perform some sanity checks. 
 */
static void dn200_get_tx_hwtstamp(struct dn200_priv *priv,
				  struct dma_desc *p, struct sk_buff *skb)
{
	struct skb_shared_hwtstamps shhwtstamp;
	bool found = false;
	u64 ns = 0;

	/* TX timestamping is PF-only and must have been enabled. */
	if (!priv->hwts_tx_en || PRIV_IS_VF(priv))
		return;

	/* exit if skb doesn't support hw tstamp */
	if (likely(!skb || !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)))
		return;

	/* check tx tstamp status: take it from the descriptor when present,
	 * otherwise fall back to the MAC timestamp register.
	 */
	if (dn200_get_tx_timestamp_status(priv, p)) {
		dn200_get_timestamp(priv, p, priv->adv_ts, &ns);
		found = true;
	} else if (!dn200_get_mac_tx_timestamp(priv, priv->hw, &ns)) {
		found = true;
	}

	if (found) {
		memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
		shhwtstamp.hwtstamp = ns_to_ktime(ns);

		netdev_dbg(priv->dev, "get valid TX hw timestamp %llu\n", ns);
		/* pass tstamp to stack */
		skb_tstamp_tx(skb, &shhwtstamp);
	}
}

/* dn200_get_rx_hwtstamp - get HW RX timestamps
 * @priv: driver private structure
 * @p : descriptor pointer
 * @np : next descriptor pointer
 * @skb : the socket buffer
 * Description :
 * This function will read received packet's timestamp from the descriptor
 * and pass it to stack. It also perform some sanity checks.
 */
static void dn200_get_rx_hwtstamp(struct dn200_priv *priv, struct dma_desc *p,
				  struct dma_desc *np, struct sk_buff *skb)
{
	struct skb_shared_hwtstamps *shhwtstamp = NULL;
	struct dma_desc *desc = p;
	u64 ns = 0;

	if (!priv->hwts_rx_en || PRIV_IS_VF(priv))
		return;
	/* For GMAC4, the valid timestamp is from CTX next desc. */
	if (priv->plat->has_gmac4 || priv->plat->has_xgmac)
		desc = np;

	/* Check if timestamp is available */
	if (dn200_get_rx_timestamp_status(priv, p, np, priv->adv_ts)) {
		dn200_get_timestamp(priv, desc, priv->adv_ts, &ns);

		netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
		shhwtstamp = skb_hwtstamps(skb);
		memset(shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
		shhwtstamp->hwtstamp = ns_to_ktime(ns);
	} else {
		netdev_dbg(priv->dev, "cannot get RX hw timestamp\n");
	}
}

/**
 * dn200_hwtstamp_set - control hardware timestamping.
 * @dev: device pointer.
 * @ifr: An IOCTL specific structure, that can contain a pointer to
 * a proprietary structure used to pass information to the driver.
 * Description:
 * This function configures the MAC to enable/disable both outgoing(TX)
 * and incoming(RX) packets time stamping based on user input.
 * Return Value:
 * 0 on success and an appropriate -ve integer on failure.
 */
static int dn200_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
{
	struct dn200_priv *priv = netdev_priv(dev);
	struct hwtstamp_config config;
	u32 ptp_v2 = 0;
	u32 tstamp_all = 0;
	u32 ptp_over_ipv4_udp = 0;
	u32 ptp_over_ipv6_udp = 0;
	u32 ptp_over_ethernet = 0;
	u32 snap_type_sel = 0;
	u32 ts_master_en = 0;
	u32 ts_event_en = 0;

	if (PRIV_IS_VF(priv)) {
		netdev_err(priv->dev, "VF No support for HW time stamping\n");
		return -EOPNOTSUPP;
	}
	if (!(priv->dma_cap.time_stamp || priv->adv_ts)) {
		netdev_alert(priv->dev, "No support for HW time stamping\n");
		priv->hwts_tx_en = 0;
		priv->hwts_rx_en = 0;

		return -EOPNOTSUPP;
	}

	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
		return -EFAULT;

	netdev_dbg(priv->dev,
		   "%s config flags:0x%x, tx_type:0x%x, rx_filter:0x%x\n",
		   __func__, config.flags, config.tx_type, config.rx_filter);
	/* reserved for future extensions */
	if (config.flags)
		return -EINVAL;
	if (config.tx_type != HWTSTAMP_TX_OFF &&
	    config.tx_type != HWTSTAMP_TX_ON)
		return -ERANGE;
	/* Map the requested rx_filter onto the PTP_TCR_* control bits. */
	if (priv->adv_ts) {
		switch (config.rx_filter) {
		case HWTSTAMP_FILTER_NONE:
			/* time stamp no incoming packet at all */
			config.rx_filter = HWTSTAMP_FILTER_NONE;
			break;
		case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
			/* PTP v1, UDP, any kind of event packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
			/* 'xmac' hardware can support Sync, Pdelay_Req and
			 * Pdelay_resp by setting bit14 and bits17/16 to 01
			 * This leaves Delay_Req timestamps out.
			 * Enable all events *and* general purpose message
			 * timestamping
			 */
			snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
			/* PTP v1, UDP, Sync packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_SYNC;
			/* take time stamp for SYNC messages only */
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
			/* PTP v1, UDP, Delay_req packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ;
			/* take time stamp for Delay_Req messages only */
			ts_master_en = PTP_TCR_TSMSTRENA;
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
			/* PTP v2, UDP, any kind of event packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			/* take time stamp for all event messages */
			snap_type_sel = PTP_TCR_SNAPTYPSEL_1;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
			/* PTP v2, UDP, Sync packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_SYNC;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			/* take time stamp for SYNC messages only */
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
			/* PTP v2, UDP, Delay_req packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			/* take time stamp for Delay_Req messages only */
			ts_master_en = PTP_TCR_TSMSTRENA;
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			break;
		case HWTSTAMP_FILTER_PTP_V2_EVENT:
			/* PTP v2/802.AS1 any layer, any kind of event packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
			if (priv->chip_id < DWMAC_CORE_4_10)
				ts_event_en = PTP_TCR_TSEVNTENA;
			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			ptp_over_ethernet = PTP_TCR_TSIPENA;
			tstamp_all = PTP_TCR_TSENALL;
			break;
		case HWTSTAMP_FILTER_PTP_V2_SYNC:
			/* PTP v2/802.AS1, any layer, Sync packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_SYNC;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			/* take time stamp for SYNC messages only */
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			ptp_over_ethernet = PTP_TCR_TSIPENA;
			break;
		case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
			/* PTP v2/802.AS1, any layer, Delay_req packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V2_DELAY_REQ;
			ptp_v2 = PTP_TCR_TSVER2ENA;
			/* take time stamp for Delay_Req messages only */
			ts_master_en = PTP_TCR_TSMSTRENA;
			ts_event_en = PTP_TCR_TSEVNTENA;

			ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
			ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
			ptp_over_ethernet = PTP_TCR_TSIPENA;
			break;
		case HWTSTAMP_FILTER_NTP_ALL:
		case HWTSTAMP_FILTER_ALL:
			/* time stamp any incoming packet */
			config.rx_filter = HWTSTAMP_FILTER_ALL;
			tstamp_all = PTP_TCR_TSENALL;
			break;
		default:
			return -ERANGE;
		}
	} else {
		switch (config.rx_filter) {
		case HWTSTAMP_FILTER_NONE:
			config.rx_filter = HWTSTAMP_FILTER_NONE;
			break;
		default:
			/* PTP v1, UDP, any kind of event packet */
			config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
			break;
		}
	}

	priv->systime_flags = DN200_HWTS_ACTIVE;

	/* NOTE(review): hwts_tx_en/hwts_rx_en are only read here, never
	 * updated from config.tx_type/rx_filter (mainline stmmac sets them
	 * at this point) — confirm the caller updates them, otherwise the
	 * new config does not take effect.
	 */
	if (priv->hwts_tx_en || priv->hwts_rx_en) {
		priv->systime_flags |= tstamp_all | ptp_v2 |
				       ptp_over_ethernet | ptp_over_ipv6_udp |
				       ptp_over_ipv4_udp | ts_event_en |
				       ts_master_en | snap_type_sel;
	}

	dn200_config_hw_tstamping(priv, priv->ptpaddr, priv->systime_flags);

	memcpy(&priv->tstamp_config, &config, sizeof(config));

	return copy_to_user(ifr->ifr_data, &config, sizeof(config))
		? -EFAULT : 0;
}

/**
 * dn200_hwtstamp_get - read hardware timestamping.
 * @dev: device pointer.
 * @ifr: An IOCTL specific structure, that can contain a pointer to
 * a proprietary structure used to pass information to the driver.
 * Description:
 * This function obtain the current hardware timestamping settings
 * as requested.
 */
static int dn200_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
{
	struct dn200_priv *priv = netdev_priv(dev);
	struct hwtstamp_config *config = &priv->tstamp_config;

	if (PRIV_IS_VF(priv)) {
		netdev_err(priv->dev, "VF No support for HW time stamping\n");
		return -EOPNOTSUPP;
	}
	if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp))
		return -EOPNOTSUPP;

	return copy_to_user(ifr->ifr_data, config, sizeof(*config))
		? -EFAULT : 0;
}

/**
 * dn200_init_tstamp_counter - init hardware timestamping counter
 * @priv: driver private structure
 * @systime_flags: timestamping flags
 * Description:
 * Initialize hardware counter for packet timestamping.
 * This is valid as long as the interface is open and not suspended.
 * Will be rerun after resuming from suspend, case in which the timestamping
 * flags updated by dn200_hwtstamp_set() also need to be restored.
+ */ +int dn200_init_tstamp_counter(struct dn200_priv *priv, u32 systime_flags) +{ + bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + struct timespec64 now; + u32 sec_inc = 0; + u64 temp = 0; + + if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp)) + return -EOPNOTSUPP; + + dn200_config_hw_tstamping(priv, priv->ptpaddr, systime_flags); + priv->systime_flags = systime_flags; + + /* program Sub Second Increment reg */ + dn200_config_sub_second_increment(priv, priv->ptpaddr, + priv->plat->clk_ptp_rate, + xmac, &sec_inc); + temp = div_u64(1000000000ULL, sec_inc); + + /* Store sub second increment for later use */ + priv->sub_second_inc = sec_inc; + + /* calculate default added value: + * formula is : + * addend = (2^32)/freq_div_ratio; + * where, freq_div_ratio = 1e9ns/sec_inc + */ + temp = (u64)(temp << 32); + priv->default_addend = div_u64(temp, priv->plat->clk_ptp_rate); + dn200_config_addend(priv, priv->ptpaddr, priv->default_addend); + + /* initialize system time */ + ktime_get_real_ts64(&now); + + /* lower 32 bits of tv_sec are safe until y2106 */ + dn200_init_systime(priv, priv->ptpaddr, (u32)now.tv_sec, now.tv_nsec); + + return 0; +} +EXPORT_SYMBOL_GPL(dn200_init_tstamp_counter); + +/** + * dn200_init_ptp - init PTP + * @priv: driver private structure + * Description: this is to verify if the HW supports the PTPv1 or PTPv2. + * This is done by looking at the HW cap. register. + * This function also registers the ptp driver. 
 */
static int dn200_init_ptp(struct dn200_priv *priv)
{
	bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
	int ret;

	/* PTP is a PF-only feature. */
	if (PRIV_IS_VF(priv))
		return -EOPNOTSUPP;
	ret = dn200_init_tstamp_counter(priv, DN200_HWTS_ACTIVE);
	if (ret)
		return ret;

	priv->adv_ts = 0;
	/* Check if adv_ts can be enabled for dwmac 4.x / xgmac core */
	if (xmac && priv->dma_cap.atime_stamp)
		priv->adv_ts = 1;

	if (priv->adv_ts) {
		netdev_dbg(priv->dev,
			   "IEEE 1588-2008 Advanced Timestamp supported\n");
	}

	return 0;
}

/* Undo dn200_init_ptp(): stop the PTP ref clock and drop the ptp clock. */
static void dn200_release_ptp(struct dn200_priv *priv)
{
	clk_disable_unprepare(priv->plat->clk_ptp_ref);
	dn200_ptp_unregister(priv);
}

/**
 * dn200_mac_flow_ctrl - Configure flow control in all queues
 * @priv: driver private structure
 * @duplex: duplex passed to the next function
 * Description: It is used for configuring the flow control in all queues
 */
static void dn200_mac_flow_ctrl(struct dn200_priv *priv, u32 duplex)
{
	u32 tx_cnt = priv->plat->tx_queues_to_use;

	priv->duplex = duplex;
	dn200_flow_ctrl(priv, priv->hw, priv->duplex, priv->flow_ctrl,
			priv->pause, tx_cnt);
}

/* Toggle the XGMAC half-duplex bit for the single-queue RGMII low-speed
 * case; a no-op for every other configuration.
 */
static void dn200_xgmac_halfduplex_set(struct dn200_priv *priv, int duplex,
				       phy_interface_t interface)
{
	u32 old_ctrl, ctrl;

	if (priv->plat->has_xgmac) {
		if (unlikely(priv->plat->tx_queues_to_use == 1 &&
			     interface == PHY_INTERFACE_MODE_RGMII &&
			     (priv->speed == SPEED_10 ||
			      priv->speed == SPEED_100 ||
			      priv->speed == SPEED_1000))) {
			old_ctrl = readl(priv->ioaddr +
					 XGMAC_MAC_EXT_CONF); // offset 0x140
			ctrl = old_ctrl;
			if (!duplex)
				ctrl |= XGMAC_MACEXT_HD;
			else
				ctrl &= ~XGMAC_MACEXT_HD;
			if (ctrl != old_ctrl)
				writel(ctrl, priv->ioaddr + XGMAC_MAC_EXT_CONF);
		}
	}
}

/* Drain pending TX by looping the MAC back on itself. With SR-IOV active,
 * first notifies all registered/opened VFs of the carrier drop and waits
 * (up to 500ms) for each of them to acknowledge before enabling loopback.
 */
static void tx_clean_by_loopback(struct dn200_priv *priv)
{
	u32 value = 0;
	u8 vf_carrier_state = 0;
	u8 vf_offset = 0;
	u8 reg_info = 0;
	u8 probe_bitmap = 0, wb_bitmap = 0;
	int i = 0;
	unsigned long in_time_start = jiffies;

	if (test_bit(DN200_DOWN, &priv->state))
		return;

	/* Nothing in flight: MAC debug register reads as zero. */
	value = readl(priv->ioaddr + XGMAC_MAC_DEBUG);
	if (!value)
		return;

	if (!priv->plat_ex->sriov_cfg)
		goto loopback;

	DN200_ITR_SYNC_SET(priv->hw, pf_carrier, 0, 1);
	irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl);
	while (true) {
		probe_bitmap = 0;
		wb_bitmap = 0;
		// num_vf_probe = priv->plat_ex->pf.registered_vfs;
		for (vf_offset = 0; vf_offset < priv->plat_ex->pf.registered_vfs; vf_offset++) {
			DN200_HEARTBEAT_GET(priv->hw, registered_vf_state, vf_offset, &reg_info);
			if ((reg_info & DN200_VF_REG_STATE_OPENED))
				probe_bitmap |= (1 << vf_offset);

			DN200_ITR_SYNC_GET(priv->hw, vf_carrier, vf_offset,
					   &vf_carrier_state);
			if (vf_carrier_state) {
				wb_bitmap |= (1 << vf_offset);
				continue;
			}
			usleep_range(1000, 2000);
		}
		/* All opened VFs acknowledged: clear the sync flags. */
		if ((wb_bitmap & probe_bitmap) == probe_bitmap) {
			for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) {
				DN200_ITR_SYNC_SET(priv->hw, vf_carrier, i,
						   0);
			}
			DN200_ITR_SYNC_SET(priv->hw, pf_carrier, 0, 0);
			break;
		}

		if (time_after(jiffies, in_time_start + msecs_to_jiffies(500))) {
			netdev_dbg(priv->dev, "%s %d wb_bitmap %d probe_bitmap %d\n", __func__, __LINE__, wb_bitmap, probe_bitmap);
			netdev_warn(priv->dev, "%s,%d PF notify VF do link down timeout\n",
				    __func__, __LINE__);
			for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) {
				DN200_ITR_SYNC_SET(priv->hw, vf_carrier, i,
						   0);
			}
			DN200_ITR_SYNC_SET(priv->hw, pf_carrier, 0, 0);
			break;
		}
	}
loopback:
	dn200_set_mac_loopback(priv, priv->ioaddr, true);
	usleep_range(2000, 3000);
}

/* Schedule the service task, which performs a close/open cycle. */
void dn200_normal_close_open(struct dn200_priv *priv)
{
	queue_work(priv->wq, &priv->service_task);
}

/* Link-down callback: defer the heavy work to the service task. */
static void dn200_mac_link_down(struct dn200_priv *priv,
				unsigned int mode, phy_interface_t interface)
{
	set_bit(DN200_MAC_LINK_DOWN, &priv->state);
	queue_work(priv->wq, &priv->service_task);
	// dn200_wq_mac_link_down(priv);
}


/* Workqueue half of the link-down handling: drain TX, stop the MAC,
 * tear down EEE state and snapshot MMC counters.
 */
static void dn200_wq_mac_link_down(struct dn200_priv *priv)
{
	if (priv->mii)
		/* set rgmii rx clock from soc */
		dn200_xgmac_rx_ext_clk_set(priv, false);
	tx_clean_by_loopback(priv);
	dn200_mac_set(priv, priv->ioaddr, false, priv->hw);
	priv->eee_active = false;
	priv->tx_lpi_enabled = false;
	priv->eee_enabled = dn200_eee_init(priv);
	dn200_set_eee_pls(priv, priv->hw, false);
	dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc);
	memset(&priv->hw->set_state, 0, sizeof(struct dn200_set_state));
	clear_bit(DN200_MAC_LINK_DOWN, &priv->state);
}

/* Reset interrupt-throttling divisors and default ITR values for all
 * RX/TX queues according to the negotiated link speed.
 */
static void dn200_set_itr_divisor(struct dn200_priv *priv, u32 speed)
{
	int i = 0;
	int itr_div = 0;

	switch (speed) {
	case SPEED_10000:
		itr_div = 512;
		break;
	default:
		itr_div = 64;
		break;
	}

	for (; i < MTL_MAX_RX_QUEUES; i++) {
		priv->rx_intr[i].itr_div = itr_div;
		priv->rx_intr[i].target_itr = 0x10;
		priv->rx_intr[i].current_itr = 0x10;
	}

	for (i = 0; i < MTL_MAX_TX_QUEUES; i++) {
		priv->tx_intr[i].itr_div = itr_div;
		priv->tx_intr[i].target_itr = 0x40;
		priv->tx_intr[i].current_itr = 0x40;
	}
}

static void dn200_rx_itr_update(struct dn200_itr_info *itr,
				struct dn200_priv *priv, u8 chan);
static void dn200_update_1G_speed_itr(struct dn200_itr_info *itr,
				      struct dn200_priv *priv, u8 chan);
/* ITR update strategy at 10G link speed. */
static const struct itr_update_ops dn200_itr_update_ops = {
	.dn200_rx_itr_update = dn200_rx_itr_update,
};

/* ITR update strategy at 1G (and lower) link speed. */
static const struct itr_update_ops dn200_itr_update_ops_1G = {
	.dn200_rx_itr_update = dn200_update_1G_speed_itr,
};

static inline void dn200_rx_itr_usec_update(struct dn200_priv *priv);
/* VF link-up callback: only ITR bookkeeping, no MAC register writes. */
static void dn200_vf_mac_link_up(struct dn200_priv *priv,
				 struct phy_device *phy,
				 unsigned int mode, phy_interface_t interface,
				 int speed, int duplex,
				 bool tx_pause, bool rx_pause)
{
	priv->speed = speed;
	dn200_rx_itr_usec_update(priv);
	if (speed == SPEED_10000)
		priv->dn200_update_ops = &dn200_itr_update_ops;
	else
		priv->dn200_update_ops = &dn200_itr_update_ops_1G;
	dn200_set_itr_divisor(priv, speed);
}

/* Select the RGMII RX clock source: external (PHY) when @external is true,
 * internal (SoC) otherwise. The clock gates around the mux switch are
 * disabled first and re-enabled afterwards to avoid glitches.
 */
static void dn200_xgmac_rx_ext_clk_set(struct dn200_priv *priv, bool external)
{
	u32 reg_val;

	/* disable clk_rx_180 and phy_clk_rx */
	reg_val =
		readl(priv->ioaddr +
		      XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
	writel(reg_val & (~(BIT(4) | BIT(6))),
	       priv->ioaddr +
	       XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));

	/* set xgmac clk mux */
	reg_val =
		readl(priv->ioaddr + XGE_XGMAC_CLK_MUX_CTRL(priv->plat_ex->funcid));
	if (external)
		reg_val &= ~(BIT(17));
	else
		reg_val |= BIT(17);
	writel(reg_val,
	       priv->ioaddr + XGE_XGMAC_CLK_MUX_CTRL(priv->plat_ex->funcid));

	/* enable clk_rx_180 and phy_clk_rx */
	reg_val =
		readl(priv->ioaddr +
		      XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
	writel(reg_val | (BIT(4) | BIT(6)),
	       priv->ioaddr +
	       XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
}

/* Program the GE TX clock divider for 10/100/1000 operation: gate the
 * divider, pulse the xge clk-gen reset, select the divider ratio for the
 * requested speed, then re-enable the divider.
 */
static void dn200_xgmac_ge_tx_clk_set(struct dn200_priv *priv, u32 speed)
{
	u32 reg_val;

	/* disable clk_tx_div */
	reg_val =
		readl(priv->ioaddr +
		      XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
	writel(reg_val & (~(BIT(3))),
	       priv->ioaddr +
	       XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));

	/* reset xge clk gen module */
	reg_val =
		readl(priv->ioaddr +
		      XGE_XGMAC_XPCS_SW_RST(priv->plat_ex->funcid));
	writel(reg_val & (~BIT(3)),
	       priv->ioaddr +
	       XGE_XGMAC_XPCS_SW_RST(priv->plat_ex->funcid));
	writel(reg_val | BIT(3),
	       priv->ioaddr +
	       XGE_XGMAC_XPCS_SW_RST(priv->plat_ex->funcid));

	/* set ge tx clk */
	reg_val =
		readl(priv->ioaddr + XGE_XGMAC_CLK_TX_CTRL(priv->plat_ex->funcid));
	reg_val &= ~(BIT(1) | BIT(2) | BIT(3));

	switch (speed) {
	case SPEED_10:
		reg_val |= BIT(2);
		break;
	case SPEED_100:
		reg_val |= BIT(1);
		break;
	default:
		break;
	}
	writel(reg_val,
	       priv->ioaddr + XGE_XGMAC_CLK_TX_CTRL(priv->plat_ex->funcid));

	/* enable clk_tx_div */
	reg_val =
		readl(priv->ioaddr +
		      XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
	writel(reg_val | BIT(3),
	       priv->ioaddr +
	       XGE_XGMAC_CLK_MUX_ENABLE_CTRL(priv->plat_ex->funcid));
}

/* For RGMII variants only: program the MAC speed-select bits and retune
 * the GE TX clock divider for the negotiated speed.
 */
static void dn200_xgmac_rgmii_speed_set(struct dn200_priv *priv,
					phy_interface_t interface, u32 speed)
{
	u32 ctrl;

	if (interface == PHY_INTERFACE_MODE_RGMII ||
	    interface == PHY_INTERFACE_MODE_RGMII_ID ||
	    interface == PHY_INTERFACE_MODE_RGMII_RXID ||
	    interface == PHY_INTERFACE_MODE_RGMII_TXID) {
		ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
		ctrl &= ~priv->hw->link.speed_mask;

		switch (speed) {
		case SPEED_1000:
			ctrl |= priv->hw->link.speed1000;
			break;
		case SPEED_100:
			ctrl |= priv->hw->link.speed100;
			break;
		case SPEED_10:
			ctrl |= priv->hw->link.speed10;
			break;
		default:
			return;
		}
		writel(ctrl, priv->ioaddr + MAC_CTRL_REG);
		dn200_xgmac_ge_tx_clk_set(priv, speed);
	}
}

/* PF link-up callback: reconfigure clocks, duplex, flow control, EEE and
 * interrupt throttling for the newly negotiated link parameters.
 */
static void dn200_mac_link_up(struct dn200_priv *priv,
			      struct phy_device *phy,
			      unsigned int mode, phy_interface_t interface,
			      int speed, int duplex,
			      bool tx_pause, bool rx_pause)
{
	u32 old_ctrl, ctrl;

	if (priv->mii)
		/* set rgmii rx clock from phy */
		dn200_xgmac_rx_ext_clk_set(priv, true);

	old_ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
	ctrl = old_ctrl;

	dn200_set_itr_divisor(priv, speed);

	if (!duplex)
		ctrl &= ~priv->hw->link.duplex;
	else
		ctrl |= priv->hw->link.duplex;

	/* Flow Control operation */
	dn200_mac_flow_ctrl(priv, duplex);
	if (ctrl != old_ctrl)
		writel(ctrl, priv->ioaddr + MAC_CTRL_REG);

	/* Make sure that speed select has been completely written. */
	dma_wmb();
	dn200_xgmac_halfduplex_set(priv, duplex, interface);
	dn200_xgmac_rgmii_speed_set(priv, interface, speed);

	dn200_mac_set(priv, priv->ioaddr, true, priv->hw);
	if (phy && priv->dma_cap.eee) {
		priv->eee_active = PRIV_PHY_OPS(priv)->init_eee(PRIV_PHY_INFO(priv), 1) >= 0;
		priv->eee_enabled = dn200_eee_init(priv);
		priv->tx_lpi_enabled = priv->eee_enabled;
		dn200_set_eee_pls(priv, priv->hw, true);
	}

	dn200_mmc_err_clear(priv, priv->mmcaddr);

	/* NOTE(review): raw 10000 here is the same value as SPEED_10000,
	 * used elsewhere in this file — kept as-is, consider unifying.
	 */
	if (speed == 10000) {
		priv->max_usecs = DN200_ITR_MAX_USECS;
		priv->min_usecs = DN200_ITR_MIN_USECS;
	} else {
		priv->max_usecs = dn200_riwt2usec(DN200_ITR_MAX_RWT, priv);
		priv->min_usecs = DN200_ITR_MIN_USECS_1G;
	}
	priv->speed = speed;
	dn200_rx_itr_usec_update(priv);
	if (speed == 10000)
		priv->dn200_update_ops = &dn200_itr_update_ops;
	else
		priv->dn200_update_ops = &dn200_itr_update_ops_1G;

	dn200_set_mac_loopback(priv, priv->ioaddr,
			       !!(priv->dev->features & NETIF_F_LOOPBACK));
}

/* Program the MAC speed-select field for both XGMII and lower-speed
 * interfaces; unknown speeds leave the register untouched.
 */
static void dn200_mac_speed_set(struct dn200_priv *priv,
				phy_interface_t interface, int speed)
{
	u32 old_ctrl, ctrl;

	old_ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
	ctrl = old_ctrl & ~priv->hw->link.speed_mask;
	if (interface == PHY_INTERFACE_MODE_XGMII) {
		switch (speed) {
		case SPEED_10000:
			ctrl |= priv->hw->link.xgmii.speed10000;
			break;
		case SPEED_2500:
			ctrl |= priv->hw->link.speed2500;
			break;
		case SPEED_1000:
			ctrl |= priv->hw->link.speed1000;
			break;
		default:
			return;
		}
	} else {
		switch (speed) {
		case SPEED_2500:
			ctrl |= priv->hw->link.speed2500;
			break;
		case SPEED_1000:
			ctrl |= priv->hw->link.speed1000;
			break;
		case SPEED_100:
			ctrl |= priv->hw->link.speed100;
			break;
		case SPEED_10:
			ctrl |= priv->hw->link.speed10;
			break;
		default:
			return;
		}
	}

	priv->speed = speed;
	dev_dbg(priv->device, "%s %d speed %d ctrl %#x\n", __func__, __LINE__,
		speed, ctrl);
	writel(ctrl, priv->ioaddr + MAC_CTRL_REG);
}

static const struct
dn200_mac_ops dn200_phy_mac_ops = {
	.mac_link_down = dn200_mac_link_down,
	.mac_link_up = dn200_mac_link_up,
	.mac_speed_set = dn200_mac_speed_set,
};

/* VF variant: link-up bookkeeping only, no link-down/speed-set hooks. */
static const struct dn200_mac_ops dn200_vf_phy_mac_ops = {
	.mac_link_up = dn200_vf_mac_link_up,
};

/**
 * dn200_check_pcs_mode - verify if RGMII/SGMII is supported
 * @priv: driver private structure
 * Description: this is to verify if the HW supports the PCS.
 * Physical Coding Sublayer (PCS) interface that can be used when the MAC is
 * configured for the TBI, RTBI, or SGMII PHY interface.
 */
static void dn200_check_pcs_mode(struct dn200_priv *priv)
{
	int interface = priv->plat->phy_interface;

	/* Only relevant without an external XPCS. */
	if (!priv->plat_ex->has_xpcs) {
		if (interface == PHY_INTERFACE_MODE_RGMII ||
		    interface == PHY_INTERFACE_MODE_RGMII_ID ||
		    interface == PHY_INTERFACE_MODE_RGMII_RXID ||
		    interface == PHY_INTERFACE_MODE_RGMII_TXID) {
			netdev_dbg(priv->dev, "PCS RGMII support enabled\n");
			priv->hw->pcs = DN200_PCS_RGMII;
			if (!priv->plat->mac_port_sel_speed)
				priv->plat->mac_port_sel_speed = SPEED_1000;
		} else if (interface == PHY_INTERFACE_MODE_SGMII) {
			netdev_dbg(priv->dev, "PCS SGMII support enabled\n");
			priv->hw->pcs = DN200_PCS_SGMII;
		}
	}
}

/**
 * dn200_init_phy - PHY initialization
 * @dev: net device structure
 * Description: it initializes the driver's PHY state, and attaches the PHY
 * to the mac driver.
 * Return value:
 * 0 on success
 */
static int dn200_init_phy(struct net_device *dev)
{
	struct dn200_priv *priv = netdev_priv(dev);

	/* No PHY abstraction attached: nothing to do. */
	if (PRIV_PHY_INFO(priv) && PRIV_PHY_OPS(priv))
		return PRIV_PHY_OPS(priv)->init(PRIV_PHY_INFO(priv));
	return 0;
}

/* Dump every RX descriptor ring to the kernel log (debug aid). */
static void dn200_display_rx_rings(struct dn200_priv *priv)
{
	u32 rx_cnt = priv->plat->rx_queues_to_use;
	unsigned int desc_size;
	void *head_rx;
	u32 queue;

	/* Display RX rings */
	for (queue = 0; queue < rx_cnt; queue++) {
		struct dn200_rx_queue *rx_q = &priv->rx_queue[queue];

		pr_info("\tRX Queue %u rings\n", queue);

		head_rx = (void *)rx_q->dma_rx;
		desc_size = sizeof(struct dma_desc);
		/* Display RX ring */
		dn200_display_ring(priv, head_rx, priv->dma_rx_size, true,
				   rx_q->dma_rx_phy, desc_size, priv->hw);
	}
}

/* Dump every TX descriptor ring to the kernel log (debug aid). */
static void dn200_display_tx_rings(struct dn200_priv *priv)
{
	u32 tx_cnt = priv->plat->tx_queues_to_use;
	unsigned int desc_size;
	void *head_tx;
	u32 queue;
	bool flags = true;
	/* Display TX rings */
	for (queue = 0; queue < tx_cnt; queue++) {
		struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];

		pr_info("\tTX Queue %d rings\n", queue);

		head_tx = (void *)tx_q->dma_tx;
		desc_size = sizeof(struct dma_desc);

		dn200_display_ring(priv, head_tx, priv->dma_tx_size, flags,
				   tx_q->dma_tx_phy, desc_size, priv->hw);
	}
}

/* Dump both RX and TX rings. */
static void dn200_display_rings(struct dn200_priv *priv)
{
	/* Display RX ring */
	dn200_display_rx_rings(priv);

	/* Display TX ring */
	dn200_display_tx_rings(priv);
}

/**
 * dn200_clear_rx_descriptors - clear RX descriptors
 * @priv: driver private structure
 * @queue: RX queue index
 * Description: this function is called to clear the RX descriptors
 * in case of both basic and extended descriptors are used.
 */
static void dn200_clear_rx_descriptors(struct dn200_priv *priv, u32 queue)
{
	struct dn200_rx_queue *rx_q = &priv->rx_queue[queue];
	int i;

	/* Clear the RX descriptors.
	 * NOTE(review): the loop stops at dma_rx_size - 2, so the
	 * "(i == priv->dma_rx_size - 1)" end-of-ring flag below can never
	 * be true and the last descriptor is never initialized here.
	 * Either the bound or the flag condition looks off — confirm
	 * against how the ring length is programmed into the hardware.
	 */
	for (i = 0; i < priv->dma_rx_size - 1; i++)
		dn200_init_rx_desc(priv, &rx_q->dma_rx[i],
				   priv->use_riwt, priv->mode,
				   (i == priv->dma_rx_size - 1),
				   priv->dma_buf_sz);
}

/**
 * dn200_clear_tx_descriptors - clear tx descriptors
 * @priv: driver private structure
 * @queue: TX queue index.
 * Description: this function is called to clear the TX descriptors
 * in case of both basic and extended descriptors are used.
 */
static void dn200_clear_tx_descriptors(struct dn200_priv *priv, u32 queue)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
	int i;

	/* Clear the TX descriptors */
	for (i = 0; i < priv->dma_tx_size; i++) {
		int last = (i == (priv->dma_tx_size - 1));
		struct dma_desc *p;

		p = &tx_q->dma_tx[i];

		dn200_init_tx_desc(priv, p, priv->mode, last);
	}
}

/**
 * dn200_clear_descriptors - clear descriptors
 * @priv: driver private structure
 * Description: this function is called to clear the TX and RX descriptors
 * in case of both basic and extended descriptors are used.
 */
static void dn200_clear_descriptors(struct dn200_priv *priv)
{
	u32 rx_queue_cnt = priv->plat->rx_queues_to_use;
	u32 tx_queue_cnt = priv->plat->tx_queues_to_use;
	u32 queue;

	/* Clear the RX descriptors */
	for (queue = 0; queue < rx_queue_cnt; queue++)
		dn200_clear_rx_descriptors(priv, queue);

	/* Clear the TX descriptors */
	for (queue = 0; queue < tx_queue_cnt; queue++)
		dn200_clear_tx_descriptors(priv, queue);
}

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
/* Pull one page buffer from the queue's RX pool and record it in @buf,
 * either as the primary page (FIRST_PAGE) or the split-header secondary
 * page. Returns -ENOMEM when the pool is exhausted.
 */
static int dn200_alloc_page(struct dn200_priv *priv,
			    struct dn200_rx_queue *rx_q,
			    struct dn200_rx_buffer *buf, char index, int offset,
			    int dma_rx_size)
{
	struct dn200_page_buf *pg_buf = NULL;

	pg_buf =
		dn200_rx_pool_buf_alloc(rx_q->rx_pool, rx_q->queue_index, offset,
					dma_rx_size);
	if (unlikely(!pg_buf))
		return -ENOMEM;

	buf->pg_buf = pg_buf;
	if (index == FIRST_PAGE) {
		buf->page = pg_buf->page;
		buf->desc_addr = pg_buf->desc_addr;
		buf->kernel_addr = pg_buf->kernel_addr;
		buf->page_offset = pg_buf->page_offset;
		buf->rx_times = 0;
	} else {
		buf->sec_page = pg_buf->page;
		buf->sec_addr = pg_buf->kernel_addr;
		buf->sec_page_offset = pg_buf->page_offset;
	}

	return 0;
}

#pragma GCC diagnostic pop
/**
 * dn200_init_rx_buffers - init the RX descriptor buffer.
 * @priv: driver private structure
 * @p: descriptor pointer
 * @i: descriptor index
 * @flags: gfp flag
 * @queue: RX queue index
 * Description: this function is called to allocate a receive buffer, perform
 * the DMA mapping and init the descriptor.
+ */ +static int dn200_init_rx_buffers(struct dn200_priv *priv, struct dma_desc *p, + int i, gfp_t flags, u32 queue) +{ + int ret; + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + struct dn200_rx_buffer *buf = &rx_q->buf_pool[i]; + + if (!buf->page) { + ret = + dn200_alloc_page(priv, rx_q, buf, FIRST_PAGE, i, + priv->dma_rx_size); + if (ret) + return ret; + dn200_set_desc_addr(priv, p, buf->desc_addr + buf->page_offset, + priv->hw); + } + + if (priv->sph && !buf->sec_page) { + ret = + dn200_alloc_page(priv, rx_q, buf, SECON_PAGE, i, + priv->dma_rx_size); + if (ret) + return ret; + dn200_set_desc_sec_addr(priv, p, buf->sec_addr, true, priv->hw); + } else { + buf->sec_page = NULL; + dn200_set_desc_sec_addr(priv, p, buf->sec_addr, false, + priv->hw); + } + + if (priv->dma_buf_sz == BUF_SIZE_16KiB) + dn200_init_desc3(priv, p); + + return 0; +} + +static inline void dn200_free_dma32_tx_buffer(struct dn200_priv *priv, + struct dn200_tx_queue *tx_q, + int entry) +{ + if (tx_q->tx_dma32_bufs[entry].mem_type != DN200_DMA32) + return; + __free_pages(tx_q->tx_dma32_bufs[entry].page, + tx_q->tx_dma32_bufs[entry].order); + + dma_unmap_page(priv->device, tx_q->tx_dma32_bufs[entry].buf, + tx_q->tx_dma32_bufs[entry].len, DMA_TO_DEVICE); + tx_q->tx_dma32_bufs[entry].mem_type = DN200_NORMAL; + tx_q->tx_dma32_bufs[entry].page = NULL; + tx_q->tx_dma32_bufs[entry].len = 0; + tx_q->tx_dma32_bufs[entry].order = 0; + if (tx_q->tx_dma32_bufs[entry].iatu_ref_ptr) { + atomic_sub(1, tx_q->tx_dma32_bufs[entry].iatu_ref_ptr); + tx_q->tx_dma32_bufs[entry].iatu_ref_ptr = NULL; + } +} + +/** + * dn200_free_tx_buffer - free RX dma buffers + * @priv: private structure + * @queue: RX queue index + * @i: buffer index. 
+ */ +static void dn200_free_tx_buffer(struct dn200_priv *priv, u32 queue, int i) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + + if (!tx_q->tx_skbuff_dma) + return; + + if (tx_q->tx_skbuff_dma[i].buf && + tx_q->tx_skbuff_dma[i].buf_type != DN200_TXBUF_T_XDP_TX) { + if (tx_q->tx_skbuff_dma[i].map_as_page) + dma_unmap_page(priv->device, + tx_q->tx_skbuff_dma[i].buf, + tx_q->tx_skbuff_dma[i].len, + DMA_TO_DEVICE); + else + dma_unmap_single(priv->device, + tx_q->tx_skbuff_dma[i].buf, + tx_q->tx_skbuff_dma[i].len, + DMA_TO_DEVICE); + } + dn200_free_dma32_tx_buffer(priv, tx_q, i); + + if (tx_q->tx_skbuff_dma[i].buf_type == DN200_TXBUF_T_XSK_TX) + tx_q->xsk_frames_done++; + + if (tx_q->tx_skbuff[i] && + tx_q->tx_skbuff_dma[i].buf_type == DN200_TXBUF_T_SKB) { + dev_kfree_skb_any(tx_q->tx_skbuff[i]); + tx_q->tx_skbuff[i] = NULL; + } + + tx_q->tx_skbuff_dma[i].buf = 0; + tx_q->tx_skbuff_dma[i].map_as_page = false; + if (tx_q->tx_skbuff_dma[i].iatu_ref_ptr) { + atomic_sub(1, tx_q->tx_skbuff_dma[i].iatu_ref_ptr); + tx_q->tx_skbuff_dma[i].iatu_ref_ptr = NULL; + } +} + +static int dn200_alloc_rx_buffers(struct dn200_priv *priv, u32 queue, + gfp_t flags) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + int i; + struct dma_desc *p; + int ret; + + /* keep one buff as free otherwise we don't known the queue is empty or full */ + for (i = 0; i < priv->dma_rx_size - 1; i++) { + p = rx_q->dma_rx + i; + ret = dn200_init_rx_buffers(priv, p, i, flags, queue); + if (ret) + return ret; + rx_q->buf_alloc_num++; + } + + return 0; +} + +/** + * dma_free_rx_xskbufs - free RX dma buffers from XSK pool + * @priv: private structure + * @queue: RX queue index + */ +/** + * __init_dma_rx_desc_rings - init the RX descriptor ring (per queue) + * @priv: driver private structure + * @queue: RX queue index + * @flags: gfp flag. + * Description: this function initializes the DMA RX descriptors + * and allocates the socket buffers. It supports the chained and ring + * modes. 
+ */ +static int __init_dma_rx_desc_rings(struct dn200_priv *priv, u32 queue, + gfp_t flags) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + int ret; + + netif_dbg(priv, probe, priv->dev, + "(%s) dma_rx_phy=0x%08x\n", __func__, (u32)rx_q->dma_rx_phy); + + dn200_clear_rx_descriptors(priv, queue); + ret = dn200_alloc_rx_buffers(priv, queue, flags); + if (ret < 0) + return -ENOMEM; + rx_q->cur_rx = 0; + rx_q->alloc_rx = rx_q->buf_alloc_num; + rx_q->dirty_rx = rx_q->buf_alloc_num; + + /* Setup the chained descriptor addresses */ + if (priv->mode == DN200_CHAIN_MODE) { + dn200_mode_init(priv, rx_q->dma_rx, + rx_q->dma_rx_phy, priv->dma_rx_size); + } + + return 0; +} + +static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue; + int ret; + + /* RX INITIALIZATION */ + netif_dbg(priv, probe, priv->dev, + "SKB addresses:\nskb\t\tskb data\tdma data\n"); + + for (queue = 0; queue < rx_count; queue++) { + ret = __init_dma_rx_desc_rings(priv, queue, flags); + if (ret) + goto err_init_rx_buffers; + } + + return 0; + +err_init_rx_buffers: + while (queue >= 0) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + rx_q->buf_alloc_num = 0; + if (queue == 0) + break; + + queue--; + } + + return ret; +} + +/** + * __init_dma_tx_desc_rings - init the TX descriptor ring (per queue) + * @priv: driver private structure + * @queue : TX queue index + * Description: this function initializes the DMA TX descriptors + * and allocates the socket buffers. It supports the chained and ring + * modes. 
 */
static int __init_dma_tx_desc_rings(struct dn200_priv *priv, u32 queue)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
	int i;
	struct dma_desc *p;

	netif_dbg(priv, probe, priv->dev,
		  "(%s) dma_tx_phy=0x%08x\n", __func__, (u32)tx_q->dma_tx_phy);

	/* Setup the chained descriptor addresses.  Skipped when HW TBS
	 * is available — presumably TBS owns the descriptor layout;
	 * TODO confirm against the TBS setup path.
	 */
	if (priv->mode == DN200_CHAIN_MODE) {
		if (!(tx_q->tbs & DN200_TBS_AVAIL))
			dn200_mode_init(priv, tx_q->dma_tx,
					tx_q->dma_tx_phy, priv->dma_tx_size);
	}

	/* Reset every descriptor and its SW bookkeeping entry. */
	for (i = 0; i < priv->dma_tx_size; i++) {
		p = tx_q->dma_tx + i;

		dn200_clear_desc(priv, p);

		tx_q->tx_skbuff_dma[i].buf = 0;
		tx_q->tx_skbuff_dma[i].map_as_page = false;
		tx_q->tx_skbuff_dma[i].len = 0;
		tx_q->tx_skbuff_dma[i].last_segment = false;
		tx_q->tx_skbuff_dma[i].iatu_ref_ptr = NULL;
		tx_q->tx_skbuff[i] = NULL;
	}

	/* Ring is empty: producer, consumer and MSS state all reset. */
	tx_q->dirty_tx = 0;
	tx_q->cur_tx = 0;
	tx_q->mss = 0;
	tx_q->next_to_watch = -1;
	atomic_set(&tx_q->txtimer_running, 0);
	netdev_tx_reset_queue(netdev_get_tx_queue(priv->dev, queue));

	return 0;
}

/* Initialize the TX descriptor ring of every TX queue in use. */
static int init_dma_tx_desc_rings(struct net_device *dev)
{
	struct dn200_priv *priv = netdev_priv(dev);
	u32 tx_queue_cnt;
	u32 queue;

	tx_queue_cnt = priv->plat->tx_queues_to_use;

	for (queue = 0; queue < tx_queue_cnt; queue++)
		__init_dma_tx_desc_rings(priv, queue);

	return 0;
}

/**
 * init_dma_desc_rings - init the RX/TX descriptor rings
 * @dev: net device structure
 * @flags: gfp flag.
 * Description: this function initializes the DMA RX/TX descriptors
 * and allocates the socket buffers. It supports the chained and ring
 * modes.
+ */ +static int init_dma_desc_rings(struct net_device *dev, gfp_t flags) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret; + + ret = init_dma_rx_desc_rings(dev, flags); + if (ret) + return ret; + + ret = init_dma_tx_desc_rings(dev); + + dn200_clear_descriptors(priv); + + if (netif_msg_hw(priv)) + dn200_display_rings(priv); + + return ret; +} + +/** + * dma_free_tx_skbufs - free TX dma buffers + * @priv: private structure + * @queue: TX queue index + */ +static void dma_free_tx_skbufs(struct dn200_priv *priv, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + int i; + + tx_q->xsk_frames_done = 0; + + for (i = 0; i < priv->dma_tx_size; i++) + dn200_free_tx_buffer(priv, queue, i); +} + +/** + * __free_dma_rx_desc_resources - free RX dma desc resources (per queue) + * @priv: private structure + * @queue: RX queue index + */ +static void __free_dma_rx_desc_resources(struct dn200_priv *priv, u32 queue) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + if (rx_q->state_saved) { + rx_q->state_saved = false; + dev_kfree_skb(rx_q->state.skb); + rx_q->state.skb = NULL; + } + rx_q->buf_alloc_num = 0; + /* Free DMA regions of consistent memory previously allocated */ + dma_free_coherent(priv->device, + priv->dma_rx_size * sizeof(struct dma_desc), + rx_q->dma_rx, rx_q->origin_dma_rx_phy); + + kfree(rx_q->buf_pool); + rx_q->buf_pool = NULL; +} + +static void free_dma_rx_desc_resources(struct dn200_priv *priv) +{ + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue; + + /* Free RX queue resources */ + for (queue = 0; queue < rx_count; queue++) + __free_dma_rx_desc_resources(priv, queue); +} + +/** + * __free_dma_tx_desc_resources - free TX dma desc resources (per queue) + * @priv: private structure + * @queue: TX queue index + */ +static void __free_dma_tx_desc_resources(struct dn200_priv *priv, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + size_t size; + void *addr; + + /* Release the DMA TX socket 
buffers */ + dma_free_tx_skbufs(priv, queue); + size = sizeof(struct dma_desc); + addr = tx_q->dma_tx; + size *= priv->dma_tx_size; + + if (tx_q->origin_dma_tx_phy) + dma_free_coherent(priv->device, size, addr, + tx_q->origin_dma_tx_phy); + + kfree(tx_q->tx_dma32_bufs); + tx_q->tx_dma32_bufs = NULL; + kfree(tx_q->tx_skbuff_dma); + tx_q->tx_skbuff_dma = NULL; + kfree(tx_q->tx_skbuff); + tx_q->tx_skbuff = NULL; +} + +static void free_dma_tx_desc_resources(struct dn200_priv *priv) +{ + u32 tx_count = priv->plat->tx_queues_to_use; + u32 queue; + + /* Free TX queue resources */ + for (queue = 0; queue < tx_count; queue++) + __free_dma_tx_desc_resources(priv, queue); +} + +static inline void *dma32_alloc_coherent(struct device *device, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + int ret; + void *addr; + + ret = dma_set_mask_and_coherent(device, DMA_BIT_MASK(32)); + if (ret) { + dev_err(device, "Failed to set DMA 32 bit Mask\n"); + return NULL; + } + addr = dma_alloc_coherent(device, size, dma_handle, gfp); + if (!addr) + return NULL; + + ret = dma_set_mask_and_coherent(device, DMA_BIT_MASK(64)); + if (ret) + dev_err(device, "Failed to set DMA 64 bit Mask\n"); + + return addr; +} + +/** + * __alloc_dma_rx_desc_resources - alloc RX resources (per queue). + * @priv: private structure + * @queue: RX queue index + * Description: according to which descriptor can be used (extend or basic) + * this function allocates the resources for TX and RX paths. In case of + * reception, for example, it pre-allocated the RX socket buffer in order to + * allow zero-copy mechanism. 
+ */ +static int __alloc_dma_rx_desc_resources(struct dn200_priv *priv, u32 queue) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + struct dn200_channel *ch = &priv->channel[queue]; + unsigned int napi_id; + + rx_q->queue_index = queue; + rx_q->priv_data = priv; + rx_q->buf_pool = + kcalloc(priv->dma_rx_size, sizeof(*rx_q->buf_pool), GFP_KERNEL); + if (!rx_q->buf_pool) + return -ENOMEM; + + rx_q->dma_rx = NULL; + rx_q->dma_rx = dma_alloc_coherent(priv->device, + priv->dma_rx_size * sizeof(struct dma_desc), + &rx_q->origin_dma_rx_phy, GFP_KERNEL); + if (!rx_q->dma_rx) + return -ENOMEM; + if (dn200_rx_iatu_find(rx_q->origin_dma_rx_phy, priv, + &rx_q->dma_rx_phy) < 0) { + dev_dbg(priv->device, + "%s %d alloc rx desc failed!!! queue %d dma_addr %#llx\n", + __func__, __LINE__, queue, rx_q->origin_dma_rx_phy); + if (rx_q->dma_rx) + dma_free_coherent(priv->device, + priv->dma_rx_size * sizeof(struct dma_desc), + rx_q->dma_rx, rx_q->origin_dma_rx_phy); + + rx_q->dma_rx = dma32_alloc_coherent(priv->device, + priv->dma_rx_size * sizeof(struct dma_desc), + &rx_q->origin_dma_rx_phy, GFP_KERNEL); + if (!rx_q->dma_rx) + return -ENOMEM; + if (dn200_rx_iatu_find(rx_q->origin_dma_rx_phy, priv, + &rx_q->dma_rx_phy) < 0) { + dev_err(priv->device, + "%s %d alloc dma32 rx desc failed!!! 
queue %d dma_addr %#llx\n", + __func__, __LINE__, queue, + rx_q->origin_dma_rx_phy); + return -ENOMEM; + } + } + + if (queue < priv->plat->rx_queues_to_use) { + if (priv->txrx_itr_combined) + napi_id = ch->agg_napi.napi_id; + else + napi_id = ch->rx_napi.napi_id; + } + ch->rx_q = rx_q; + return 0; +} + +static int alloc_dma_rx_desc_resources(struct dn200_priv *priv) +{ + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue; + int ret; + + /* RX queues buffers and DMA */ + for (queue = 0; queue < rx_count; queue++) { + ret = __alloc_dma_rx_desc_resources(priv, queue); + if (ret) + goto err_dma; + } + return 0; + +err_dma: + free_dma_rx_desc_resources(priv); + + return ret; +} + +/** + * __alloc_dma_tx_desc_resources - alloc TX resources (per queue). + * @priv: private structure + * @queue: TX queue index + * Description: according to which descriptor can be used (extend or basic) + * this function allocates the resources for TX and RX paths. In case of + * reception, for example, it pre-allocated the RX socket buffer in order to + * allow zero-copy mechanism. 
 */
static int __alloc_dma_tx_desc_resources(struct dn200_priv *priv, u32 queue)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
	size_t size;
	void *addr = NULL;

	tx_q->queue_index = queue;
	tx_q->priv_data = priv;

	/* alloc tx bufs array to manage dma32 buffers */
	tx_q->tx_dma32_bufs = kcalloc(priv->dma_tx_size,
				      sizeof(*tx_q->tx_dma32_bufs), GFP_KERNEL);
	if (!tx_q->tx_dma32_bufs)
		return -ENOMEM;

	tx_q->tx_skbuff_dma = kcalloc(priv->dma_tx_size,
				      sizeof(*tx_q->tx_skbuff_dma), GFP_KERNEL);
	if (!tx_q->tx_skbuff_dma)
		return -ENOMEM;

	tx_q->tx_skbuff = kcalloc(priv->dma_tx_size,
				  sizeof(struct sk_buff *), GFP_KERNEL);
	if (!tx_q->tx_skbuff)
		return -ENOMEM;

	/* NOTE: allocations left behind by the early returns above are
	 * released by the caller via free_dma_tx_desc_resources().
	 */
	size = sizeof(struct dma_desc);
	size *= priv->dma_tx_size;

	addr = dma_alloc_coherent(priv->device, size,
				  &tx_q->origin_dma_tx_phy, GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	/* The ring must be reachable through an iATU window; if the
	 * 64-bit allocation cannot be translated, retry from the 32-bit
	 * DMA zone.
	 */
	if (dn200_rx_iatu_find(tx_q->origin_dma_tx_phy, priv, &tx_q->dma_tx_phy)
	    < 0) {
		dev_dbg(priv->device,
			"%s alloc tx desc failed!!! queue %d dma_addr %#llx\n",
			__func__, queue, tx_q->origin_dma_tx_phy);
		if (addr)
			dma_free_coherent(priv->device, size, addr,
					  tx_q->origin_dma_tx_phy);
		addr = dma32_alloc_coherent(priv->device, size,
					    &tx_q->origin_dma_tx_phy,
					    GFP_KERNEL);
		if (!addr)
			return -ENOMEM;
		if (dn200_rx_iatu_find
		    (tx_q->origin_dma_tx_phy, priv, &tx_q->dma_tx_phy) < 0) {
			dev_err(priv->device,
				"%s alloc dma32 tx desc failed!!! queue %d dma_addr %#llx\n",
				__func__, queue, tx_q->origin_dma_tx_phy);
			return -ENOMEM;
		}
	}

	tx_q->dma_tx = addr;

	return 0;
}

static int alloc_dma_tx_desc_resources(struct dn200_priv *priv)
{
	u32 tx_count = priv->plat->tx_queues_to_use;
	u32 queue;
	int ret;

	/* TX queues buffers and DMA */
	for (queue = 0; queue < tx_count; queue++) {
		ret = __alloc_dma_tx_desc_resources(priv, queue);
		if (ret)
			goto err_dma;
	}
	return 0;

err_dma:
	free_dma_tx_desc_resources(priv);
	return ret;
}

/**
 * alloc_dma_desc_resources - alloc TX/RX resources.
 * @priv: private structure
 * Description: according to which descriptor can be used (extend or basic)
 * this function allocates the resources for TX and RX paths. In case of
 * reception, for example, it pre-allocated the RX socket buffer in order to
 * allow zero-copy mechanism.
 */
static int alloc_dma_desc_resources(struct dn200_priv *priv)
{
	int ret;

	ret = dn200_rx_pool_setup(priv);
	if (ret)
		goto rx_pool_err;
	/* RX Allocation */
	ret = alloc_dma_rx_desc_resources(priv);
	if (ret)
		goto rx_desc_err;
	ret = alloc_dma_tx_desc_resources(priv);
	if (ret)
		goto tx_desc_err;
	return 0;

tx_desc_err:
	free_dma_rx_desc_resources(priv);
rx_desc_err:
rx_pool_err:
	/* "destory" (sic) matches the helper's name elsewhere in the
	 * driver; NOTE(review): called even when pool setup itself
	 * failed — confirm dn200_rx_pool_destory() is safe in that case.
	 */
	dn200_rx_pool_destory(priv);
	return ret;
}

/**
 * free_dma_desc_resources - free dma desc resources
 * @priv: private structure
 */
static void free_dma_desc_resources(struct dn200_priv *priv)
{
	u32 tx_count = priv->plat->tx_queues_to_use;
	u32 rx_count = priv->plat->rx_queues_to_use;
	u32 i;
	struct dn200_tx_queue *tx_q;
	struct dn200_rx_queue *rx_q;

	/* Release the DMA TX socket buffers */
	free_dma_tx_desc_resources(priv);

	/* Release the DMA RX socket buffers later
	 * to ensure all pending XDP_TX buffers are returned.
	 */
	free_dma_rx_desc_resources(priv);
	dn200_rx_pool_destory(priv);

	/* Skip iATU teardown when the PCIe link is already gone. */
	if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state))
		dn200_iatu_uninit(priv);
	for (i = 0; i < tx_count; i++) {
		tx_q = &priv->tx_queue[i];
		memset(tx_q, 0, sizeof(struct dn200_tx_queue));
	}
	for (i = 0; i < rx_count; i++) {
		rx_q = &priv->rx_queue[i];
		memset(rx_q, 0, sizeof(struct dn200_rx_queue));
	}
}

/**
 * dn200_mac_enable_rx_queues - Enable MAC rx queues
 * @priv: driver private structure
 * Description: It is used for enabling the rx queues in the MAC
 */
static void dn200_mac_enable_rx_queues(struct dn200_priv *priv)
{
	u32 queue;
	u8 mode;
	u32 rx_queues_count = priv->plat->rx_queues_to_use;

	for (queue = 0; queue < rx_queues_count; queue++) {
		mode = priv->plat->rx_queues_cfg[queue].mode_to_use;
		dn200_rx_queue_enable(priv, priv->hw, mode, queue);
	}

	if (!PRIV_IS_VF(priv)) {
		/* broadcast & multicast go to the last MTL queue and are
		 * copied to all DMA channels (PF only).
		 */
		queue = DN200_LAST_QUEUE(priv);
		dn200_rx_queue_enable(priv, priv->hw, MTL_QUEUE_DCB, queue);
	}
}

/**
 * dn200_start_rx_dma - start RX DMA channel
 * @priv: driver private structure
 * @chan: RX channel index
 * Description:
 * This starts a RX DMA channel
 */
static void dn200_start_rx_dma(struct dn200_priv *priv, u32 chan)
{
	netdev_dbg(priv->dev, "DMA RX processes started in channel %d\n", chan);
	dn200_start_rx(priv, priv->ioaddr, chan, priv->hw);
}

/**
 * dn200_start_tx_dma - start TX DMA channel
 * @priv: driver private structure
 * @chan: TX channel index
 * Description:
 * This starts a TX DMA channel
 */
static void dn200_start_tx_dma(struct dn200_priv *priv, u32 chan)
{
	netdev_dbg(priv->dev, "DMA TX processes started in channel %d\n", chan);
	dn200_start_tx(priv, priv->ioaddr, chan, priv->hw);
}

/**
 * dn200_stop_rx_dma - stop RX DMA channel
 * @priv: driver private structure
 * @chan: RX channel index
 * Description:
 * This stops a RX DMA channel
 */
static void dn200_stop_rx_dma(struct dn200_priv *priv, u32 chan)
{
	netdev_dbg(priv->dev, "DMA RX processes stopped in channel %d\n", chan);
	dn200_stop_rx(priv, priv->ioaddr, chan, priv->hw);
}

/**
 * dn200_stop_tx_dma - stop TX DMA channel
 * @priv: driver private structure
 * @chan: TX channel index
 * Description:
 * This stops a TX DMA channel
 */
static void dn200_stop_tx_dma(struct dn200_priv *priv, u32 chan)
{
	netdev_dbg(priv->dev, "DMA TX processes stopped in channel %d\n", chan);
	dn200_stop_tx(priv, priv->ioaddr, chan, priv->hw);
}

/* Enable RX+TX DMA interrupts on every channel, under each channel's
 * lock.
 */
static void dn200_enable_all_dma_irq(struct dn200_priv *priv)
{
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	u32 dma_csr_ch = max(rx_channels_count, tx_channels_count);
	u32 chan;

	for (chan = 0; chan < dma_csr_ch; chan++) {
		struct dn200_channel *ch = &priv->channel[chan];
		unsigned long flags;

		spin_lock_irqsave(&ch->lock, flags);
		dn200_enable_dma_irq(priv, priv->ioaddr, chan, 1, 1, priv->hw);
		spin_unlock_irqrestore(&ch->lock, flags);
	}
}

/**
 * dn200_start_all_dma - start all RX and TX DMA channels
 * @priv: driver private structure
 * Description:
 * This starts all the RX and TX DMA channels
 */
void dn200_start_all_dma(struct dn200_priv *priv)
{
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	u32 chan = 0;

	for (chan = 0; chan < rx_channels_count; chan++)
		dn200_start_rx_dma(priv, chan);

	for (chan = 0; chan < tx_channels_count; chan++)
		dn200_start_tx_dma(priv, chan);
}

/**
 * dn200_stop_all_dma - stop all RX and TX DMA channels
 * @priv: driver private structure
 * Description:
 * This stops the RX and TX DMA channels
 */
void dn200_stop_all_dma(struct dn200_priv *priv)
{
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	u32 chan = 0;

	for (chan = 0; chan < rx_channels_count; chan++)
		dn200_stop_rx_dma(priv, chan);

	for (chan = 0; chan < tx_channels_count; chan++)
		dn200_stop_tx_dma(priv, chan);
}

/**
 * dn200_stop_vf_dma - stop the RX and TX DMA channels of one VF
 * @priv: driver private structure
 * @vf_num: 1-based VF number; 0 selects the PF's own channels
 * Description:
 * This stops the RX and TX DMA channels belonging to the given VF
 * (or to the PF when @vf_num is 0).
 */
static void dn200_stop_vf_dma(struct dn200_priv *priv, u8 vf_num)
{
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	u32 chan = 0, chan_start = 0;

	if (vf_num) {
		chan_start = priv->plat_ex->pf.vfs[vf_num - 1].rx_queue_start;
		rx_channels_count = priv->plat_ex->pf.vfs[vf_num - 1].rx_queues_num;
		tx_channels_count = priv->plat_ex->pf.vfs[vf_num - 1].tx_queues_num;
	} else {
		chan_start = 0;
	}
	for (chan = 0; chan < rx_channels_count; chan++)
		dn200_stop_rx_dma(priv, chan + chan_start);

	for (chan = 0; chan < tx_channels_count; chan++)
		dn200_stop_tx_dma(priv, chan + chan_start);
}

static void dn200_set_dma_operation_mode(struct dn200_priv *priv, u32 txmode,
					 u32 rxmode, u32 chan, u8 tc,
					 enum dn200_txrx_mode_dir set_dir);

/**
 * dn200_dma_operation_mode - HW DMA operation mode
 * @priv: driver private structure
 * Description: it is used for configuring the DMA operation mode register in
 * order to program the tx/rx DMA thresholds or Store-And-Forward mode.
 */
void dn200_dma_operation_mode(struct dn200_priv *priv)
{
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	u32 txmode = 0;
	u32 rxmode = 0;
	u32 chan = 0;

	if (priv->plat->force_sf_dma_mode || priv->plat->tx_coe) {
		/* In case of GMAC, SF mode can be enabled
		 * to perform the TX COE in HW. This depends on:
		 * 1) TX COE if actually supported
		 * 2) There is no bugged Jumbo frame support
		 * that needs to not insert csum in the TDES.
		 */
		txmode = SF_DMA_MODE;
		/* clear rxmode store and forward (set to 0) and rxpbl=2,
		 * almost resolve the overflow issue
		 */
		rxmode = SF_DMA_MODE; /*SF_DMA_MODE; */
	} else {
		txmode = 64;
		rxmode = 0; /*SF_DMA_MODE; */
	}

	/* configure all channels */
	for (chan = 0; chan < rx_channels_count; chan++) {
		dn200_set_dma_operation_mode(priv, txmode, rxmode, chan, chan,
					     DN200_SET_RX_MODE);

		dn200_set_dma_bfsize(priv, priv->ioaddr,
				     priv->dma_buf_sz, chan, priv->hw);
	}
	for (chan = 0; chan < tx_channels_count; chan++) {
		dn200_set_dma_operation_mode(priv, txmode, rxmode, chan, chan,
					     DN200_SET_TX_MODE);
	}
	/* pf set dma operation mode for all queues(vfs),
	 * when other vfs are tx/rx flow,
	 * one vf set self tx mode will cause tx timeout and pcie err
	 */
	if (PRIV_SRIOV_SUPPORT(priv)) {
		int vf_queue = priv->plat_ex->default_tx_queue_num + priv->plat_ex->max_vfs;

		for (chan = priv->plat_ex->default_tx_queue_num; chan < vf_queue;
		     chan++) {
			dev_dbg(priv->device,
				"%s, %d, chan:%d, txmode:%d, rxmode:%d\n", __func__,
				__LINE__, chan, txmode, rxmode);
			/* To ensure all vfs run in same bandwidth, the method is:
			 * 1. map all vfs mtl queue to tc 0
			 * 2. all vfs use same weight configured in tc 0
			 */
			dn200_set_dma_operation_mode(priv, txmode, rxmode, chan, 0,
						     DN200_SET_TX_MODE);
		}
	}
}

/* Unmap and clear the DMA mapping of one TX ring entry, and drop its
 * iATU window reference.
 */
static inline void dn200_unmap_txbuff(struct dn200_priv *priv,
				      struct dn200_tx_queue *tx_q,
				      unsigned int entry)
{
	if (likely(tx_q->tx_skbuff_dma[entry].buf)) {
		if (tx_q->tx_skbuff_dma[entry].map_as_page)
			dma_unmap_page(priv->device,
				       tx_q->tx_skbuff_dma[entry].buf,
				       tx_q->tx_skbuff_dma[entry].len,
				       DMA_TO_DEVICE);
		else
			dma_unmap_single(priv->device,
					 tx_q->tx_skbuff_dma[entry].buf,
					 tx_q->tx_skbuff_dma[entry].len,
					 DMA_TO_DEVICE);
		tx_q->tx_skbuff_dma[entry].buf = 0;
		tx_q->tx_skbuff_dma[entry].len = 0;
		tx_q->tx_skbuff_dma[entry].map_as_page = false;
		if (tx_q->tx_skbuff_dma[entry].iatu_ref_ptr) {
			atomic_sub(1, tx_q->tx_skbuff_dma[entry].iatu_ref_ptr);
			tx_q->tx_skbuff_dma[entry].iatu_ref_ptr = NULL;
		}
	}
}

/**
 * dn200_sw_tx_clean - to manage the transmission completion
 * @priv: driver private structure
 * @budget: napi budget limiting this functions packet handling
 * @queue: TX queue index
 * Description: it reclaims the transmit resources after transmission completes.
 */
static int dn200_sw_tx_clean(struct dn200_priv *priv, int budget, u32 queue)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
	unsigned int bytes_compl = 0, pkts_compl = 0;
	unsigned int entry, xmits = 0, count = 0;
	unsigned int cur_tx = 0;

	/* NOTE(review): unlike dn200_tx_clean(), this variant does NOT
	 * check tx_dma_own and reclaims every entry up to cur_tx —
	 * presumably a forced software cleanup for when DMA is stopped;
	 * confirm callers never invoke it while the DMA is running.
	 */
	cur_tx = tx_q->cur_tx;
	priv->xstats.tx_clean++;
	tx_q->xsk_frames_done = 0;
	entry = tx_q->dirty_tx;

	/* Try to clean all TX complete frame in 1 shot */
	while ((entry != cur_tx) && count < priv->dma_tx_size
	       /*&& count <= budget*/) {
		struct sk_buff *skb;
		struct dma_desc *p;
		int status;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();
		if (tx_q->tx_skbuff_dma[entry].buf_type == DN200_TXBUF_T_SKB)
			skb = tx_q->tx_skbuff[entry];
		else
			skb = NULL;
		p = tx_q->dma_tx + entry;
		status = dn200_tx_status(priv, &priv->dev->stats,
					 &priv->xstats, p, priv->ioaddr);
		count++;
		/* Make sure descriptor fields are read after reading
		 * the own bit.
		 */
		dma_rmb();

		/* Just consider the last segment and ... */
		if (likely(!(status & tx_not_ls))) {
			/* ... verify the status error condition */
			if (unlikely(status & tx_err)) {
				priv->dev->stats.tx_errors++;
			} else {
				priv->dev->stats.tx_packets++;
				priv->xstats.tx_pkt_n++;
				priv->xstats.txq_stats[queue].tx_pkt_n++;
			}
			if (skb)
				dn200_get_tx_hwtstamp(priv, p, skb);
		}

		/* Release DMA mappings, bounce buffers and the skb. */
		dn200_unmap_txbuff(priv, tx_q, entry);
		dn200_free_dma32_tx_buffer(priv, tx_q, entry);
		dn200_clean_desc3(priv, tx_q, p);

		tx_q->tx_skbuff_dma[entry].last_segment = false;
		tx_q->tx_skbuff_dma[entry].is_jumbo = false;

		if (tx_q->tx_skbuff_dma[entry].buf_type == DN200_TXBUF_T_SKB) {
			if (likely(skb)) {
				pkts_compl++;
				bytes_compl += skb->len;
				napi_consume_skb(skb, budget);
				tx_q->tx_skbuff[entry] = NULL;
			}
		}

		dn200_release_tx_desc(priv, p, priv->mode);

		entry = DN200_GET_ENTRY(entry, priv->dma_tx_size);
	}
	tx_q->dirty_tx = entry;
	netdev_tx_completed_queue(netdev_get_tx_queue(priv->dev, queue),
				  pkts_compl, bytes_compl);

	/* Restart the queue if it was stopped and room is available. */
	if (unlikely(netif_tx_queue_stopped(netdev_get_tx_queue(priv->dev, queue))) &&
	    dn200_tx_avail(priv, queue) > DN200_TX_THRESH(priv)) {
		netif_dbg(priv, tx_done, priv->dev,
			  "%s: restart transmit\n", __func__);
		netif_tx_wake_queue(netdev_get_tx_queue(priv->dev, queue));
	}

	return max(count, xmits);
}

/**
 * dn200_tx_clean - to manage the transmission completion
 * @priv: driver private structure
 * @budget: napi budget limiting this functions packet handling
 * @queue: TX queue index
 * Description: it reclaims the transmit resources after transmission completes.
 */
int dn200_tx_clean(struct dn200_priv *priv, int budget, u32 queue)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
	unsigned int bytes_compl = 0, pkts_compl = 0;
	unsigned int entry, xmits = 0, count = 0;
	int next_to_watch = tx_q->next_to_watch;

	/*The read barrier is used to protect next_to_watch*/
	smp_rmb();
	priv->xstats.tx_clean++;
	tx_q->xsk_frames_done = 0;
	entry = tx_q->dirty_tx;

	/* Try to clean all TX complete frame in 1 shot */
	while ((entry != next_to_watch) && count < priv->dma_tx_size
	       /*&& count <= budget*/) {
		struct sk_buff *skb;
		struct dma_desc *p;
		int status;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();
		if (tx_q->tx_skbuff_dma[entry].buf_type == DN200_TXBUF_T_SKB)
			skb = tx_q->tx_skbuff[entry];
		else
			skb = NULL;
		p = tx_q->dma_tx + entry;
		status = dn200_tx_status(priv, &priv->dev->stats,
					 &priv->xstats, p, priv->ioaddr);

		/* Check if the descriptor is owned by the DMA */
		if (unlikely(status & tx_dma_own))
			break;

		count++;
		/* Make sure descriptor fields are read after reading
		 * the own bit.
		 */
		dma_rmb();

		/* Just consider the last segment and ... */
		if (likely(!(status & tx_not_ls))) {
			/* ... verify the status error condition */
			if (unlikely(status & tx_err)) {
				priv->dev->stats.tx_errors++;
			} else {
				priv->dev->stats.tx_packets++;
				priv->xstats.tx_pkt_n++;
				priv->xstats.txq_stats[queue].tx_pkt_n++;
			}
			if (skb)
				dn200_get_tx_hwtstamp(priv, p, skb);
		}

		/* Release DMA mappings, bounce buffers and the skb. */
		dn200_unmap_txbuff(priv, tx_q, entry);
		dn200_free_dma32_tx_buffer(priv, tx_q, entry);
		dn200_clean_desc3(priv, tx_q, p);

		tx_q->tx_skbuff_dma[entry].last_segment = false;
		tx_q->tx_skbuff_dma[entry].is_jumbo = false;

		if (tx_q->tx_skbuff_dma[entry].buf_type == DN200_TXBUF_T_SKB) {
			if (likely(skb)) {
				pkts_compl++;
				bytes_compl += skb->len;
				napi_consume_skb(skb, budget);
				tx_q->tx_skbuff[entry] = NULL;
			}
		}

		dn200_release_tx_desc(priv, p, priv->mode);

		entry = DN200_GET_ENTRY(entry, priv->dma_tx_size);
	}
	tx_q->dirty_tx = entry;
	netdev_tx_completed_queue(netdev_get_tx_queue(priv->dev, queue),
				  pkts_compl, bytes_compl);

	/* Restart the queue if it was stopped and room is available. */
	if (unlikely(netif_tx_queue_stopped(netdev_get_tx_queue(priv->dev, queue))) &&
	    dn200_tx_avail(priv, queue) > DN200_TX_THRESH(priv)) {
		netif_dbg(priv, tx_done, priv->dev,
			  "%s: restart transmit\n", __func__);
		netif_tx_wake_queue(netdev_get_tx_queue(priv->dev, queue));
	}

	/* Re-arm EEE LPI entry if software timer mode is in use. */
	if (priv->eee_enabled && !priv->tx_path_in_lpi_mode &&
	    priv->eee_sw_timer_en) {
		if (dn200_enable_eee_mode(priv))
			mod_timer(&priv->eee_ctrl_timer,
				  DN200_LPI_T(priv->tx_lpi_timer));
	}

	/* Combine decisions from TX clean and XSK TX */
	if (tx_q->cur_tx != tx_q->dirty_tx) {
		/* We still have pending packets, let's call for a new scheduling */
		hrtimer_start(&tx_q->txtimer,
			      DN200_COAL_TIMER(priv->tx_coal_timer[queue] ?: 1),
			      HRTIMER_MODE_REL);
		tx_q->txtimer_need_sch = true;
	} else {
		atomic_set(&tx_q->txtimer_running, 0);
		/*write barrier to protect shared value*/
		smp_wmb();
	}

	return max(count, xmits);
}

/**
 * dn200_tx_iatu_ref_clean - drop iATU references of completed TX entries
 * @priv: driver private structure
 * @tx_q: TX queue whose completed entries should release their iATU refs
 * Description: walks completed descriptors and releases only the iATU
 * window reference counts; all other resources are reclaimed later in
 * softirq context by the regular TX clean path.
 */
void dn200_tx_iatu_ref_clean(struct dn200_priv *priv,
			     struct dn200_tx_queue *tx_q)
{
	unsigned int entry, count = 0;
	struct dma_desc *p;
	int status;

	entry = tx_q->dirty_tx;

	/* Try to clean all TX complete frame in 1 shot */
	while ((entry != tx_q->cur_tx) && count < priv->dma_tx_size) {
		p = tx_q->dma_tx + entry;
		status = dn200_tx_status(priv, &priv->dev->stats,
					 &priv->xstats, p, priv->ioaddr);
		/* Check if the descriptor is owned by the DMA */
		if (unlikely(status & tx_dma_own))
			break;

		count++;

		/*Only clean iatu ref count, other resource wait soft irq */
		if (tx_q->tx_skbuff_dma[entry].iatu_ref_ptr) {
			atomic_sub(1, tx_q->tx_skbuff_dma[entry].iatu_ref_ptr);
			tx_q->tx_skbuff_dma[entry].iatu_ref_ptr = NULL;
		}
		if (tx_q->tx_dma32_bufs[entry].iatu_ref_ptr) {
			atomic_sub(1, tx_q->tx_dma32_bufs[entry].iatu_ref_ptr);
			tx_q->tx_dma32_bufs[entry].iatu_ref_ptr = NULL;
		}

		entry = DN200_GET_ENTRY(entry, priv->dma_tx_size);
	}
}

/**
 * dn200_tx_err - to manage the tx error
 * @priv: driver private structure
 * @chan: channel index
 * Description: it cleans the descriptors and restarts the transmission
 * in case of transmission errors.
 */
static void dn200_tx_err(struct dn200_priv *priv, u32 chan)
{
	struct dn200_tx_queue *tx_q = &priv->tx_queue[chan];

	netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, chan));

	/* Stop the channel, drop every in-flight buffer, reinitialize
	 * the ring, then restart the channel from a clean state.
	 */
	dn200_stop_tx_dma(priv, chan);
	dma_free_tx_skbufs(priv, chan);
	dn200_clear_tx_descriptors(priv, chan);
	tx_q->dirty_tx = 0;
	tx_q->cur_tx = 0;
	tx_q->mss = 0;
	netdev_tx_reset_queue(netdev_get_tx_queue(priv->dev, chan));
	dn200_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg,
			   tx_q->dma_tx_phy, chan, priv->hw);
	dn200_start_tx_dma(priv, chan);

	priv->dev->stats.tx_errors++;
	netif_tx_wake_queue(netdev_get_tx_queue(priv->dev, chan));
}

/**
 * dn200_set_dma_operation_mode - Set DMA operation mode by channel
 * @priv: driver private structure
 * @txmode: TX operating mode
 * @rxmode: RX operating mode
 * @chan: channel index
 * @tc: traffic class to map the MTL queue to (clamped to 0..7)
 * @set_dir: which direction(s) to program (DN200_SET_TX_MODE and/or
 *           DN200_SET_RX_MODE)
 * Description: it is used for configuring of the DMA operation mode in
 * runtime in order to program the tx/rx DMA thresholds or Store-And-Forward
 * mode.
 */
static void dn200_set_dma_operation_mode(struct dn200_priv *priv, u32 txmode,
					 u32 rxmode, u32 chan, u8 tc,
					 enum dn200_txrx_mode_dir set_dir)
{
	u8 rxqmode = MTL_QUEUE_DCB;
	u8 txqmode = MTL_QUEUE_DCB;
	u32 rx_channels_count = priv->plat->rx_queues_to_use;
	u32 tx_channels_count = priv->plat->tx_queues_to_use;
	int rxfifosz = priv->plat->rx_fifo_size;
	int txfifosz = priv->plat->tx_fifo_size;

	/* FIFO partitioning is PF-only; VFs have no access. */
	if (PRIV_IS_VF(priv))
		return;
	/* just support 7 tcs */
	if (tc > 0x7) {
		dev_info(priv->device, "%s, %d, change input tc:%d to %d\n",
			 __func__, __LINE__, tc, tc & 0x7);
		tc &= 0x7;
	}

	/* Fall back to the HW-reported FIFO sizes when the platform
	 * does not provide them.
	 */
	if (rxfifosz == 0)
		rxfifosz = priv->dma_cap.rx_fifo_size;
	if (txfifosz == 0)
		txfifosz = priv->dma_cap.tx_fifo_size;

	/* Adjust for real per queue fifo size */
	rxfifosz /= rx_channels_count;
	if (tx_channels_count > 1)
		txfifosz = (txfifosz - priv->tx_fifo_queue_0) / (tx_channels_count - 1);

	if (set_dir & DN200_SET_TX_MODE) {
		/* queue 0 used for pf jumbo frame, give it larger fifo size,
		 * vf don't support jumbo
		 */
		if (chan == 0) {
			if (tx_channels_count != 1)
				txfifosz = priv->tx_fifo_queue_0;
			dn200_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
					  txfifosz, txqmode,
					  tc, priv->hw);
		} else {
			dev_dbg(priv->device,
				"%s, %d, chan:%d, txfifosz:%d, txmode:%d, txqmode:%d\n",
				__func__, __LINE__, chan, txfifosz, txmode,
				txqmode);
			if (chan >= priv->plat_ex->default_tx_queue_num) /*set vf fifo*/
				txfifosz = priv->vf_tx_fifo_size;
			dn200_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
					  txfifosz, txqmode, tc, priv->hw);
		}
	}

	if (set_dir & DN200_SET_RX_MODE) {
		/* queue 15 used for untag packets and tag prio 0 & 1 packets,
		 * so give it larger fifo size
		 */
		dn200_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, priv->mtl_queue_fifo_avg,
				  rxqmode, priv->hw);

		/* DDS fifo size(DSS use last queue to copy broadcast &
		 * mutlicast pkts to all channel)
		 */
		if (!PRIV_IS_PUREPF(priv)) {
			chan = DN200_LAST_QUEUE(priv);
			dn200_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, priv->mtl_queue_fifo_more,
					  rxqmode, priv->hw);
		}
	}
}

/* Returns true (and raises a global error) when the safety-feature IRQ
 * status reports a real fault; -EINVAL from the helper is treated as
 * "no safety feature", not an error.
 */
static bool dn200_safety_feat_interrupt(struct dn200_priv *priv)
{
	int ret;

	ret = dn200_safety_feat_irq_status(priv, priv->dev,
					   priv->ioaddr, priv->dma_cap.asp,
					   &priv->sstats);
	if (ret && (ret != -EINVAL)) {
		dn200_global_err(priv, DN200_SAFETY_FEAT_INT);
		return true;
	}

	return false;
}

/* Decide which NAPI context(s) to schedule for channel @chan given the
 * interrupt direction @dir, and return the raw handle_rx/handle_tx
 * status bits.  With multi-MSI the direction is implied by the vector;
 * otherwise the DMA status register is consulted.
 */
static int dn200_napi_check(struct dn200_priv *priv, u32 chan, u32 dir)
{
	int status = 0;
	struct dn200_channel *ch = &priv->channel[chan];
	struct napi_struct *rx_napi;
	struct napi_struct *tx_napi;
	struct napi_struct *agg_napi;
	struct dn200_itr_info *rx_intr;
	/*unsigned long flags; */

	if (!priv->plat->multi_msi_en) {
		status = dn200_dma_interrupt_status(priv, priv->ioaddr,
						    &priv->xstats, chan, dir,
						    priv->hw);
	} else {
		if (dir == DMA_DIR_RX)
			status = handle_rx;
		else if (dir == DMA_DIR_TX)
			status = handle_tx;
		else if (dir == DMA_DIR_RXTX)
			status = handle_rx | handle_tx;
	}

	/* Combined RX+TX polling uses the aggregate NAPI when both
	 * directions fired and the channel serves both queues.
	 */
	if ((status == (handle_rx | handle_tx)) &&
	    (chan < priv->plat->rx_queues_to_use &&
	     chan < priv->plat->tx_queues_to_use) &&
	    priv->txrx_itr_combined) {
		agg_napi = &ch->agg_napi;
		if (napi_schedule_prep(agg_napi))
			__napi_schedule(agg_napi);
	} else {
		rx_napi = &ch->rx_napi;
		tx_napi = &ch->tx_napi;
		if ((status & handle_rx) &&
		    chan < priv->plat->rx_queues_to_use) {
			rx_intr = &priv->rx_intr[ch->index];
			if (napi_schedule_prep(rx_napi)) {
				/* With dynamic ITR, stretch the RX
				 * watchdog while the poll runs.
				 */
				if (rx_intr->itr_setting & DN200_ITR_DYNAMIC_ITR &&
				    rx_intr->current_itr > DN200_ITR_RWT_BOUND)
					dn200_rx_watchdog(priv, priv->ioaddr,
							  DN200_ITR_MAX_RWT, ch->index, priv->hw);
				__napi_schedule(rx_napi);
			}
		}
		if ((status & handle_tx) &&
		    chan < priv->plat->tx_queues_to_use) {
			if (napi_schedule_prep(tx_napi)) {
				/* Mask TX IRQs until the poll completes. */
				dn200_disable_tx_dma_irq(priv->ioaddr,
							 ch->index, priv->hw);
				__napi_schedule(tx_napi);
			}
		}
	}

	return status;
}

/**
 * dn200_dma_interrupt - DMA ISR
ISR + * @priv: driver private structure + * Description: this is the DMA ISR. It is called by the main ISR. + * It calls the dwmac dma routine and schedule poll method in case of some + * work can be done. + */ +static void dn200_dma_interrupt(struct dn200_priv *priv) +{ + u32 tx_channel_count = priv->plat->tx_queues_to_use; + u32 rx_channel_count = priv->plat->rx_queues_to_use; + u32 channels_to_check = + tx_channel_count > + rx_channel_count ? tx_channel_count : rx_channel_count; + u32 chan; + int status[DN200_CH_MAX]; + + /* Make sure we never check beyond our status buffer. */ + if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) + channels_to_check = ARRAY_SIZE(status); + + for (chan = 0; chan < channels_to_check; chan++) + status[chan] = dn200_napi_check(priv, chan, DMA_DIR_RXTX); + + for (chan = 0; chan < tx_channel_count; chan++) { + if (unlikely(status[chan] == tx_hard_error)) + dn200_tx_err(priv, chan); + } +} + +/** + * dn200_mmc_setup: setup the Mac Management Counters (MMC) + * @priv: driver private structure + * Description: this masks the MMC irq, in fact, the counters are managed in SW. + */ +static void dn200_mmc_setup(struct dn200_priv *priv) +{ + unsigned int mode = MMC_CNTRL_RESET_ON_READ | MMC_CNTRL_COUNTER_RESET | + MMC_CNTRL_PRESET | MMC_CNTRL_FULL_HALF_PRESET; + + dn200_mmc_intr_all_mask(priv, priv->mmcaddr); + + if (priv->dma_cap.rmon) { + dn200_mmc_ctrl(priv, priv->mmcaddr, mode); + } else { + netdev_info(priv->dev, + "No MAC Management Counters available\n"); + } +} + +/** + * dn200_check_hw_features_support - get MAC capabilities from the HW cap. register. + * @priv: driver private structure + * Description: + * new GMAC chip generations have a new register to indicate the + * presence of the optional feature/functions. + * This can be also used to override the value passed through the + * platform and necessary for old MAC10/100 and GMAC chips. 
+ */ +static bool dn200_check_hw_features_support(struct dn200_priv *priv) +{ + int ret = 0; + + ret = dn200_get_hw_feature(priv, priv->ioaddr, &priv->dma_cap); + dn200_sriov_reconfig_hw_feature(priv, &priv->dma_cap); + + if (!ret) + return true; + + return false; +} + +/** + * dn200_check_ether_addr - check if the MAC addr is valid + * @priv: driver private structure + * Description: + * it is to verify if the MAC address is valid, in case of failures it + * generates a random MAC address + */ +static void dn200_check_ether_addr(struct dn200_priv *priv) +{ + if (!is_valid_ether_addr(priv->dev->dev_addr)) { + eth_hw_addr_random(priv->dev); + dev_info(priv->device, "device MAC address %pM\n", + priv->dev->dev_addr); + } else { + eth_hw_addr_set(priv->dev, priv->dev->dev_addr); + } +} + +/** + * dn200_init_dma_engine - DMA init. + * @priv: driver private structure + * Description: + * It inits the DMA invoking the specific MAC/GMAC callback. + * Some DMA parameters can be passed from the platform; + * in case of these are not passed a default is kept for the MAC or GMAC. 
+ */ +static int dn200_init_dma_engine(struct dn200_priv *priv) +{ + u32 rx_channels_count = priv->plat->rx_queues_to_use; + u32 tx_channels_count = priv->plat->tx_queues_to_use; + u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); + struct dn200_rx_queue *rx_q; + struct dn200_tx_queue *tx_q; + u32 chan = 0; + int atds = 0; + int ret = 0; + + if (!priv->plat->dma_cfg || !priv->plat->dma_cfg->pbl) { + dev_err(priv->device, "Invalid DMA configuration\n"); + return -EINVAL; + } + + /* DMA Configuration */ + dn200_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg, atds, priv->hw); + + if (priv->plat->axi) + dn200_axi(priv, priv->ioaddr, priv->plat->axi, priv->hw); + + /* DMA CSR Channel configuration */ + for (chan = 0; chan < dma_csr_ch; chan++) { + dn200_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan, + priv->hw); + dn200_disable_dma_irq(priv, priv->ioaddr, chan, 1, 1, priv->hw); + } + + /* DMA RX Channel Configuration */ + for (chan = 0; chan < rx_channels_count; chan++) { + rx_q = &priv->rx_queue[chan]; + + dn200_init_rx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, + rx_q->dma_rx_phy, chan, priv->hw); + + rx_q->rx_tail_addr = rx_q->dma_rx_phy + + (rx_q->buf_alloc_num * sizeof(struct dma_desc)); + dn200_set_rx_tail_ptr(priv, priv->ioaddr, + rx_q->rx_tail_addr, chan, priv->hw); + } + + /* DMA TX Channel Configuration */ + for (chan = 0; chan < tx_channels_count; chan++) { + tx_q = &priv->tx_queue[chan]; + + dn200_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, + tx_q->dma_tx_phy, chan, priv->hw); + + tx_q->tx_tail_addr = tx_q->dma_tx_phy; + dn200_set_tx_tail_ptr(priv, priv->ioaddr, + tx_q->tx_tail_addr, chan, priv->hw); + } + + return ret; +} + +static inline void dn200_tx_timer_arm(struct dn200_priv *priv, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + u32 cpuid = 0; + + cpuid = find_last_bit(cpumask_bits(irq_get_affinity_mask(priv->tx_irq[tx_q->queue_index])), + nr_cpumask_bits); + queue_work_on(cpuid, priv->tx_wq, 
&tx_q->tx_task); + tx_q->task_need_sch = true; + atomic_set(&tx_q->txtimer_running, 1); +} + +static inline void dn200_tx_timer_poll(struct dn200_priv *priv, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + u32 cpuid = 0; + + cpuid = find_last_bit(cpumask_bits(irq_get_affinity_mask(priv->tx_irq[tx_q->queue_index])), + nr_cpumask_bits); + queue_work_on(cpuid, priv->tx_wq, &tx_q->poll_tx_task); +} + +static inline void dn200_rx_timer_poll(struct dn200_priv *priv, u32 queue) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + u32 cpuid = 0; + + cpuid = find_last_bit(cpumask_bits(irq_get_affinity_mask(priv->rx_irq[rx_q->queue_index])), + nr_cpumask_bits); + queue_work_on(cpuid, priv->tx_wq, &rx_q->poll_rx_task); +} + +/** + * dn200_tx_timer - mitigation sw timer for tx. + * @t: data pointer + * Description: + * This is the timer handler to directly invoke the dn200_tx_clean. + */ +static enum hrtimer_restart dn200_tx_timer(struct hrtimer *t) +{ + struct dn200_tx_queue *tx_q = + container_of(t, struct dn200_tx_queue, txtimer); + struct dn200_priv *priv = tx_q->priv_data; + struct dn200_channel *ch; + struct napi_struct *napi; + + tx_q->txtimer_need_sch = false; + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return HRTIMER_NORESTART; + + ch = &priv->channel[tx_q->queue_index]; + if (priv->txrx_itr_combined) + napi = tx_q->xsk_pool ? &ch->rxtx_napi : &ch->agg_napi; + else + napi = tx_q->xsk_pool ? 
&ch->rxtx_napi : &ch->tx_napi; + + if (likely(napi_schedule_prep(napi))) + __napi_schedule(napi); + + return HRTIMER_NORESTART; +} + +static void dn200_tx_task(struct work_struct *work) +{ + struct dn200_tx_queue *tx_q = container_of(work, struct dn200_tx_queue, + tx_task); + struct dn200_priv *priv = tx_q->priv_data; + + if (unlikely(test_bit(DN200_DOWN, &priv->state) || tx_q->txtimer.function == NULL)) { + netdev_info(priv->dev, "%s chan %d\n", __func__, tx_q->queue_index); + return; + } + + hrtimer_start(&tx_q->txtimer, + DN200_COAL_TIMER(priv->tx_coal_timer[tx_q->queue_index] ? : 1), + HRTIMER_MODE_REL); + tx_q->task_need_sch = false; + tx_q->txtimer_need_sch = true; +} + +/** + * dn200_tx_timer - mitigation sw timer for tx. + * @t: data pointer + * Description: + * This is the timer handler to directly invoke the dn200_tx_clean. + */ +static enum hrtimer_restart dn200_poll_tx_timer(struct hrtimer *t) +{ + struct dn200_tx_queue *tx_q = + container_of(t, struct dn200_tx_queue, poll_txtimer); + struct dn200_priv *priv = tx_q->priv_data; + struct dn200_channel *ch; + struct napi_struct *napi; + + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return HRTIMER_NORESTART; + + ch = &priv->channel[tx_q->queue_index]; + if (priv->txrx_itr_combined) + napi = tx_q->xsk_pool ? &ch->rxtx_napi : &ch->agg_napi; + else + napi = tx_q->xsk_pool ? &ch->rxtx_napi : &ch->tx_napi; + + if (likely(napi_schedule_prep(napi))) + __napi_schedule(napi); + + return HRTIMER_NORESTART; +} + +static void dn200_poll_tx_task(struct work_struct *work) +{ + struct dn200_tx_queue *tx_q = container_of(work, struct dn200_tx_queue, + poll_tx_task); + struct dn200_priv *priv = tx_q->priv_data; + + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return; + hrtimer_start(&tx_q->txtimer, + DN200_POLL_TIMER(2), HRTIMER_MODE_REL); +} + +/** + * dn200_tx_timer - mitigation sw timer for tx. + * @t: data pointer + * Description: + * This is the timer handler to directly invoke the dn200_tx_clean. 
+ */ +static enum hrtimer_restart dn200_poll_rx_timer(struct hrtimer *t) +{ + struct dn200_rx_queue *rx_q = + container_of(t, struct dn200_rx_queue, poll_rxtimer); + struct dn200_priv *priv = rx_q->priv_data; + struct dn200_channel *ch; + struct napi_struct *napi; + + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return HRTIMER_NORESTART; + + ch = &priv->channel[rx_q->queue_index]; + if (priv->txrx_itr_combined) + napi = &ch->agg_napi; + else + napi = &ch->rx_napi; + + if (likely(napi_schedule_prep(napi))) + __napi_schedule(napi); + + return HRTIMER_NORESTART; +} + +static void dn200_poll_rx_task(struct work_struct *work) +{ + struct dn200_rx_queue *rx_q = container_of(work, struct dn200_rx_queue, + poll_rx_task); + struct dn200_priv *priv = rx_q->priv_data; + + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return; + hrtimer_start(&rx_q->poll_rxtimer, + DN200_POLL_TIMER(2), HRTIMER_MODE_REL); +} + +/** + * dn200_init_coalesce - init mitigation options. + * @priv: driver private structure + * Description: + * This inits the coalesce parameters: i.e. timer rate, + * timer handler and default threshold used for enabling the + * interrupt on completion bit. 
+ */ +static void dn200_init_coalesce(struct dn200_priv *priv) +{ + u32 tx_channel_count = priv->plat->tx_queues_to_use; + u32 rx_channel_count = priv->plat->rx_queues_to_use; + u32 chan; + struct dn200_channel *ch; + + for (chan = 0; chan < tx_channel_count; chan++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[chan]; + + ch = &priv->channel[chan]; + if (!priv->tx_coal_frames_set[chan] || + (priv->tx_coal_frames_set[chan] > + ((u32)(priv->dma_tx_size >> 1)))) + priv->tx_coal_frames_set[chan] = + min((u32)(priv->dma_tx_size >> 1), + (u32)DN200_TX_FRAMES); + priv->tx_coal_timer[chan] = DN200_COAL_TX_TIMER; + + hrtimer_init(&tx_q->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + tx_q->txtimer.function = dn200_tx_timer; + + INIT_WORK(&tx_q->tx_task, dn200_tx_task); + + hrtimer_init(&tx_q->poll_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + tx_q->poll_txtimer.function = dn200_poll_tx_timer; + + INIT_WORK(&tx_q->poll_tx_task, dn200_poll_tx_task); + } + + for (chan = 0; chan < rx_channel_count; chan++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[chan]; + + if (!priv->rx_coal_frames[chan]) + priv->rx_coal_frames[chan] = DN200_RX_FRAMES; + + if ((priv->rx_coal_frames[chan] + dn200_rx_refill_size(priv)) >= + priv->dma_rx_size) { + netdev_warn(priv->dev, + "change queue %d rx-frames to %d\n", chan, + DN200_RX_MIN_FRAMES); + priv->rx_coal_frames[chan] = DN200_RX_MIN_FRAMES; + } + + hrtimer_init(&rx_q->poll_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rx_q->poll_rxtimer.function = dn200_poll_rx_timer; + + INIT_WORK(&rx_q->poll_rx_task, dn200_poll_rx_task); + } +} + +static void dn200_set_rings_length(struct dn200_priv *priv) +{ + u32 rx_channels_count = priv->plat->rx_queues_to_use; + u32 tx_channels_count = priv->plat->tx_queues_to_use; + u32 chan; + + /* set TX ring length */ + for (chan = 0; chan < tx_channels_count; chan++) + dn200_set_tx_ring_len(priv, priv->ioaddr, + (priv->dma_tx_size - 1), chan, priv->hw); + + /* set RX ring length */ + for (chan = 0; chan < 
rx_channels_count; chan++) + dn200_set_rx_ring_len(priv, priv->ioaddr, + (priv->dma_rx_size - 1), chan, priv->hw); +} + +/** + * dn200_set_tx_queue_weight - Set TX queue weight + * @priv: driver private structure + * Description: It is used for setting TX queues weight + */ +static void dn200_set_tx_queue_weight(struct dn200_priv *priv) +{ + u32 tx_queues_count = priv->plat->tx_queues_to_use; + u32 weight; + u32 queue; + + for (queue = 0; queue < tx_queues_count; queue++) { + weight = priv->plat->tx_queues_cfg[queue].weight; + dn200_set_mtl_tx_queue_weight(priv, priv->hw, weight, queue); + } +} + +/** + * dn200_set_rx_queue_weight - Set RX queue weight + * @priv: driver private structure + * Description: It is used for setting RX queues weight + */ +static void dn200_set_rx_queue_weight(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 weight; + u32 queue; + + for (queue = 0; queue < rx_queues_count; queue++) { + weight = priv->plat->rx_queues_cfg[queue].weight; + dn200_set_mtl_rx_queue_weight(priv, priv->hw, weight, queue); + } +} + +/** + * dn200_configure_cbs - Configure CBS in TX queue + * @priv: driver private structure + * Description: It is used for configuring CBS in AVB TX queues + */ +static void dn200_configure_cbs(struct dn200_priv *priv) +{ + u32 tx_queues_count = priv->plat->tx_queues_to_use; + u32 mode_to_use; + u32 queue; + + /* queue 0 is reserved for legacy traffic */ + for (queue = 1; queue < tx_queues_count; queue++) { + mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use; + if (mode_to_use == MTL_QUEUE_DCB) + continue; + + dn200_config_cbs(priv, priv->hw, + priv->plat->tx_queues_cfg[queue].send_slope, + priv->plat->tx_queues_cfg[queue].idle_slope, + priv->plat->tx_queues_cfg[queue].high_credit, + priv->plat->tx_queues_cfg[queue].low_credit, + queue); + } +} + +/** + * dn200_rx_queue_dma_chan_map - Map RX queue to RX dma channel + * @priv: driver private structure + * Description: It is used for 
mapping RX queues to RX dma channels + */ +static void dn200_rx_queue_dma_chan_map(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + u32 chan; + + for (queue = 0; queue < rx_queues_count; queue++) { + chan = priv->plat->rx_queues_cfg[queue].chan; + dn200_map_mtl_to_dma(priv, priv->hw, queue, chan); + } +} + +/** + * dn200_mac_config_rx_queues_prio - Configure RX Queue priority + * @priv: driver private structure + * Description: It is used for configuring the RX Queue Priority + */ +static void dn200_mac_config_rx_queues_prio(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + u32 prio; + + for (queue = 0; queue < rx_queues_count; queue++) { + if (!priv->plat->rx_queues_cfg[queue].use_prio) + continue; + + prio = priv->plat->rx_queues_cfg[queue].prio; + dn200_rx_queue_prio(priv, priv->hw, prio, queue); + } +} + +/** + * dn200_mac_config_tx_queues_prio - Configure TX Queue priority + * @priv: driver private structure + * Description: It is used for configuring the TX Queue Priority + */ +static void dn200_mac_config_tx_queues_prio(struct dn200_priv *priv) +{ + u32 tx_queues_count = priv->plat->tx_queues_to_use; + u32 queue; + u32 prio; + + for (queue = 0; queue < tx_queues_count; queue++) { + if (!priv->plat->tx_queues_cfg[queue].use_prio) + continue; + + prio = priv->plat->tx_queues_cfg[queue].prio; + dn200_tx_queue_prio(priv, priv->hw, prio, queue); + } +} + +/** + * dn200_mac_config_rx_queues_routing - Configure RX Queue Routing + * @priv: driver private structure + * Description: It is used for configuring the RX queue routing + */ +static void dn200_mac_config_rx_queues_routing(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + u8 packet; + + for (queue = 0; queue < rx_queues_count; queue++) { + /* no specific packet type routing specified for the queue */ + if (priv->plat->rx_queues_cfg[queue].pkt_route == 0x0) + continue; + 
+ packet = priv->plat->rx_queues_cfg[queue].pkt_route; + dn200_rx_queue_routing(priv, priv->hw, packet, queue); + } +} + +/** + * dn200_rx_queue_dma_chan_dynamic_map - Map RX queue to RX dma channel as dynamic + * @priv: driver private structure + * Description: It is used for mapping RX MTL queues to RX dma channels as dynamic + */ +static void dn200_rx_queue_dma_chan_dynamic_map(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + + bool dynamic = true; + + /* if dynamic is true: set all rx queues as dynamic + * if dynamic is false: set all rx queues as static mapping, + * mtl queue N statically map to dma channel N + * e.g. enable 4 mtl queues, just 0~3 dma channel or desc rings can receive pkts + * 1. untag pkts route to queue 0, + * 2. tag prio 0 & 1 route to queue 0, + * 3. tag prio 2 & 3 route to queue 1, + * 4. tag prio 0 & 1 route to queue 2, + * 5. tag prio 2 & 3 route to queue 3. + */ + for (queue = 0; queue < rx_queues_count; queue++) + dn200_mtl_dynamic_chan_set(priv, priv->hw, queue, dynamic); + + /* set last queue as dynamic map */ + if (!PRIV_IS_VF(priv)) + dn200_mtl_dynamic_chan_set(priv, priv->hw, + DN200_LAST_QUEUE(priv), dynamic); +} + +static void dn200_mac_config_rss(struct dn200_priv *priv) +{ + if (!priv->dma_cap.rssen || !priv->plat->rss_en) { + priv->rss.enable = false; + return; + } + + if (priv->dev->features & NETIF_F_RXHASH) + priv->rss.enable = true; + else + priv->rss.enable = false; + + if (priv->plat->rx_queues_to_use <= 1) + priv->rss.enable = false; + dn200_rss_configure(priv, priv->hw, &priv->rss, + priv->plat->rx_queues_to_use); +} + +/** + * dn200_mtl_configuration - Configure MTL + * @priv: driver private structure + * Description: It is used for configurring MTL + */ +static void dn200_mtl_configuration(struct dn200_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 tx_queues_count = priv->plat->tx_queues_to_use; + + if (tx_queues_count > 1) + 
dn200_set_tx_queue_weight(priv); + + if (rx_queues_count > 1) + dn200_set_rx_queue_weight(priv); + + /* Configure MTL RX algorithms */ + if (rx_queues_count > 1) + dn200_prog_mtl_rx_algorithms(priv, priv->hw, + priv->plat->rx_sched_algorithm); + + /* Configure MTL TX algorithms */ + if (tx_queues_count > 1) + dn200_prog_mtl_tx_algorithms(priv, priv->hw, + priv->plat->tx_sched_algorithm); + + /* Configure CBS in AVB TX queues */ + if (tx_queues_count > 1) + dn200_configure_cbs(priv); + + /* Map RX MTL to DMA channels */ + dn200_rx_queue_dma_chan_map(priv); + + /* Enable MAC RX Queues */ + dn200_mac_enable_rx_queues(priv); + + /* Set RX priorities */ + if (rx_queues_count > 1) + dn200_mac_config_rx_queues_prio(priv); + + /* Set TX priorities */ + if (tx_queues_count > 1) + dn200_mac_config_tx_queues_prio(priv); + + /* Set RX routing */ + if (rx_queues_count > 1) + dn200_mac_config_rx_queues_routing(priv); + + /* Set MTL queues to DMA channels mapping as dynamic + * rss, l3/l4 filter, vf routing, etc. features should base on it + */ + dn200_rx_queue_dma_chan_dynamic_map(priv); + + /* Receive Side Scaling */ + dn200_mac_config_rss(priv); +} + +static void dn200_safety_feat_configuration(struct dn200_priv *priv) +{ + if (priv->dma_cap.asp) { + dn200_safety_feat_config(priv, priv->ioaddr, priv->dma_cap.asp, + priv->plat->safety_feat_cfg, priv->hw); + } +} + +u32 dn200_riwt2usec(u32 riwt, struct dn200_priv *priv) +{ + unsigned long clk = clk_get_rate(priv->plat->dn200_clk); + + if (!clk) { + clk = priv->plat->clk_ref_rate; + if (!clk) + return 0; + } + /* use 512 system clock cycles as rwt units */ + return (riwt * 512) / (clk / 1000000); +} + +/** + * dn200_hw_setup - setup mac in a usable state. + * @dev : pointer to the device structure. + * @ptp_register: register PTP if set + * Description: + * this is the main function to setup the HW in a usable state because the + * dma engine is reset, the core registers are configured (e.g. AXI, + * Checksum features, timers). 
The DMA is ready to start receiving and + * transmitting. + * Return value: + * 0 on success and an appropriate (-)ve integer as defined in errno.h + * file on failure. + */ + +static int dn200_hw_setup(struct net_device *dev, bool ptp_register) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 queue; + u32 rx_cnt = priv->plat->rx_queues_to_use; + u32 tx_cnt = priv->plat->tx_queues_to_use; + bool sph_en; + u32 chan; + int ret; + + /* DMA initialization and SW reset */ + ret = dn200_init_dma_engine(priv); + if (ret < 0) { + netdev_err(priv->dev, + "%s: DMA engine common part initialization failed\n", + __func__); + return ret; + } + + /* PS and related bits will be programmed according to the speed */ + if (priv->hw->pcs) { + int speed = priv->plat->mac_port_sel_speed; + + if (speed == SPEED_10 || speed == SPEED_100 || + speed == SPEED_1000) { + priv->hw->ps = speed; + } else { + dev_warn(priv->device, "invalid port speed\n"); + priv->hw->ps = 0; + } + } + + /* Initialize the MAC Core */ + dn200_core_init(priv, priv->hw, dev); + + /* Initialize MTL */ + dn200_mtl_configuration(priv); + + /* Initialize Safety Features */ + dn200_safety_feat_configuration(priv); + + ret = dn200_rx_ipc(priv, priv->hw); + if (!ret) { + netdev_warn(priv->dev, "RX IPC Checksum Offload disabled\n"); + priv->hw->rx_csum = 0; + } + + /* Disable the MAC Rx/Tx , mac should enabled after phy link up */ + dn200_mac_set(priv, priv->ioaddr, false, priv->hw); + + /* Set the HW DMA mode and the COE */ + dn200_dma_operation_mode(priv); + + if (PRIV_IS_VF(priv)) { + priv->dma_cap.rmon = 0; + priv->dma_cap.asp = 0; + } else { + dn200_mmc_setup(priv); + } + + if (ptp_register) { + ret = clk_prepare_enable(priv->plat->clk_ptp_ref); + if (ret < 0) + netdev_warn(priv->dev, + "failed to enable PTP reference clock: %pe\n", + ERR_PTR(ret)); + } + + if (!PRIV_IS_VF(priv)) { + ret = dn200_init_ptp(priv); + if (ret == -EOPNOTSUPP) + netdev_warn(priv->dev, "PTP not supported by HW\n"); + else if (ret) + 
netdev_warn(priv->dev, "PTP init failed\n"); + else if (ptp_register) + dn200_ptp_register(priv); + } + priv->eee_tw_timer = DN200_DEFAULT_TWT_LS; + + /* Convert the timer from msec to usec */ + if (!priv->tx_lpi_timer) + priv->tx_lpi_timer = DN200_DEFAULT_LPI_TIMER * 1000; + + for (queue = 0; queue < rx_cnt; queue++) { + if (priv->use_riwt) { + if (!priv->rx_riwt[queue]) + priv->rx_riwt[queue] = DEF_DMA_RIWT; + priv->rx_rius[queue] = + dn200_riwt2usec(priv->rx_riwt[queue], priv); + /*Dynamic itr is supported by default */ + if (!priv->rx_intr[queue].itr_setting) + priv->rx_intr[queue].itr_setting = + (DN200_ITR_DYNAMIC_ITR | 1); + priv->rx_intr[queue].target_itr = priv->min_usecs; + dn200_rx_watchdog(priv, priv->ioaddr, + priv->rx_riwt[queue], queue, + priv->hw); + } + if (!priv->tx_intr[queue].itr_setting) + priv->tx_intr[queue].itr_setting = + (DN200_ITR_DYNAMIC_ITR | 1); + priv->tx_intr[queue].target_itr = DN200_TX_FRAMES; + } + + if (priv->hw->pcs) + dn200_pcs_ctrl_ane(priv, priv->ioaddr, 1, priv->hw->ps, 0); + + /* set TX and RX rings length */ + dn200_set_rings_length(priv); + + /* Enable TSO */ + if (priv->tso) { + for (chan = 0; chan < tx_cnt; chan++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[chan]; + + /* TSO and TBS cannot co-exist */ + if (tx_q->tbs & DN200_TBS_AVAIL) + continue; + + dn200_enable_tso(priv, priv->ioaddr, 1, chan, priv->hw); + } + } + + /* Enable Split Header */ + sph_en = (priv->hw->rx_csum > 0) && priv->sph; + for (chan = 0; chan < rx_cnt; chan++) + dn200_enable_sph(priv, priv->ioaddr, sph_en, chan, priv->hw); + + /* VLAN Tag Insertion */ + if (priv->dma_cap.vlins) + dn200_enable_vlan(priv, priv->hw, + (priv->dev->features & NETIF_F_HW_VLAN_CTAG_TX) ? 
+ DN200_VLAN_INSERT : DN200_VLAN_NONE); + + /* TBS */ + for (chan = 0; chan < tx_cnt; chan++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[chan]; + int enable = tx_q->tbs & DN200_TBS_AVAIL; + + dn200_enable_tbs(priv, priv->ioaddr, enable, chan, priv->hw); + } + /* DCB */ + if (!PRIV_IS_VF(priv)) + dn200_dcbnl_init(priv, true); + + /* Start the ball rolling... */ + if (!PRIV_IS_VF(priv)) + dn200_start_all_dma(priv); + + return 0; +} + +static void dn200_hw_teardown(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + + clk_disable_unprepare(priv->plat->clk_ptp_ref); +} + +static void dn200_free_irq(struct net_device *dev, + enum request_irq_err irq_err, int irq_idx) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct device *pdev = priv->device; + int j; + + switch (irq_err) { + case REQ_IRQ_ERR_ALL: + irq_idx = priv->plat->tx_queues_to_use; + fallthrough; + case REQ_IRQ_ERR_RXTX: + if (priv->txrx_itr_combined) { + for (j = irq_idx - 1; j >= 0; j--) { + if (priv->tx_irq[j] > 0) { + irq_set_affinity_hint(priv->tx_irq[j], + NULL); + synchronize_irq(priv->tx_irq[j]); + devm_free_irq(pdev, priv->tx_irq[j], + &priv->channel[j]); + } + if (priv->rx_irq[j] > 0) { + irq_set_affinity_hint(priv->rx_irq[j], + NULL); + synchronize_irq(priv->rx_irq[j]); + devm_free_irq(pdev, priv->rx_irq[j], + &priv->channel[j]); + } + } + } + fallthrough; + case REQ_IRQ_ERR_TX: + if (!priv->txrx_itr_combined) { + for (j = irq_idx - 1; j >= 0; j--) { + netdev_dbg(dev, "j %d tx irq %d rx irq %d\n", j, + priv->tx_irq[j], priv->rx_irq[j]); + if (priv->tx_irq[j] > 0) { + irq_set_affinity_hint(priv->tx_irq[j], + NULL); + synchronize_irq(priv->tx_irq[j]); + devm_free_irq(pdev, priv->tx_irq[j], + &priv->tx_queue[j]); + } + } + irq_idx = priv->plat->rx_queues_to_use; + } + fallthrough; + case REQ_IRQ_ERR_RX: + if (!priv->txrx_itr_combined) { + for (j = irq_idx - 1; j >= 0; j--) { + if (priv->rx_irq[j] > 0) { + irq_set_affinity_hint(priv->rx_irq[j], + NULL); + 
synchronize_irq(priv->rx_irq[j]); + devm_free_irq(pdev, priv->rx_irq[j], + &priv->rx_queue[j]); + } + } + } + fallthrough; + case REQ_IRQ_ERR_SFTY_UE: + if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq) { + synchronize_irq(priv->sfty_ue_irq); + devm_free_irq(pdev, priv->sfty_ue_irq, dev); + } + fallthrough; + case REQ_IRQ_ERR_SFTY_CE: + if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq) { + synchronize_irq(priv->sfty_ce_irq); + devm_free_irq(pdev, priv->sfty_ce_irq, dev); + } + fallthrough; + case REQ_IRQ_ERR_LPI: + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) { + synchronize_irq(priv->lpi_irq); + devm_free_irq(pdev, priv->lpi_irq, dev); + } + fallthrough; + case REQ_IRQ_ERR_MAC: + if (dev->irq > 0) { + synchronize_irq(dev->irq); + devm_free_irq(pdev, dev->irq, dev); + } + fallthrough; + case REQ_IRQ_ERR_NO: + /* If MAC IRQ request error, no more IRQ to free */ + break; + } + + kfree(priv->cpu_mask); +} + +static void dn200_set_affinity_hint(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 max_q = 0, cpu_offset = 0; + u16 pf_id = priv->plat_ex->pf_id; + int i = 0; + + max_q = max_t(u32, priv->plat->rx_queues_to_use, + priv->plat->tx_queues_to_use); + + if (max_q > 2) { + cpu_offset = (priv->plat_ex->pf_id ? 8 : 0); + for (i = 0; i < priv->plat->rx_queues_to_use; i++) { + if (priv->rx_irq[i] == 0) + continue; + cpumask_clear(priv->cpu_mask); + cpumask_set_cpu(cpumask_local_spread + (i + cpu_offset, priv->numa_node), priv->cpu_mask); + irq_set_affinity_hint(priv->rx_irq[i], priv->cpu_mask); + } + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + if (priv->tx_irq[i] == 0) + continue; + cpumask_clear(priv->cpu_mask); + cpumask_set_cpu(cpumask_local_spread + (i + cpu_offset, priv->numa_node), priv->cpu_mask); + irq_set_affinity_hint(priv->tx_irq[i], priv->cpu_mask); + } + } else { + if (pf_id < 2) { + cpu_offset = (priv->plat_ex->pf_id ? 
8 : 0); + } else { + max_q = max_t(u32, priv->plat->rx_queues_to_use, + priv->plat->tx_queues_to_use); + cpu_offset = 16 + max_q * (pf_id - 2) * 2; + } + for (i = 0; i < priv->plat->rx_queues_to_use; i++) { + if (priv->rx_irq[i] == 0) + continue; + cpumask_clear(priv->cpu_mask); + cpumask_set_cpu(cpumask_local_spread + (i + cpu_offset, priv->numa_node), priv->cpu_mask); + irq_set_affinity_hint(priv->rx_irq[i], priv->cpu_mask); + } + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + if (priv->tx_irq[i] == 0) + continue; + cpumask_clear(priv->cpu_mask); + cpumask_set_cpu(cpumask_local_spread + (i + cpu_offset + priv->plat->rx_queues_to_use, priv->numa_node), priv->cpu_mask); + irq_set_affinity_hint(priv->tx_irq[i], priv->cpu_mask); + } + } +} + +static int dn200_request_irq_multi_msi(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct device *pdev = priv->device; + enum request_irq_err irq_err; + u32 cpuid = 0; + int irq_idx = 0; + char *int_name; + int ret; + int i; + + priv->cpu_mask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!priv->cpu_mask) + return -ENOMEM; + + /* For pmt interrupt */ + if (dev->irq > 0) { + int_name = priv->int_name_mac; + sprintf(int_name, "%s:%s", dev->name, "mac"); + ret = devm_request_irq(pdev, dev->irq, dn200_mac_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc mac MSI %d (error: %d)\n", + __func__, dev->irq, ret); + irq_err = REQ_IRQ_ERR_NO; + goto irq_error; + } + } + + /* Request the LPI IRQ in case of another line + * is used for LPI + */ + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) { + int_name = priv->int_name_lpi; + sprintf(int_name, "%s:%s", dev->name, "lpi"); + ret = devm_request_irq(pdev, priv->lpi_irq, + dn200_mac_interrupt, 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc lpi MSI %d (error: %d)\n", + __func__, priv->lpi_irq, ret); + irq_err = REQ_IRQ_ERR_MAC; + goto irq_error; + } + } + + /* Request 
the Safety Feature Correctible Error line in + * case of another line is used + */ + if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq) { + int_name = priv->int_name_sfty_ce; + sprintf(int_name, "%s:%s", dev->name, "safety-ce"); + ret = devm_request_irq(pdev, priv->sfty_ce_irq, + dn200_safety_interrupt, 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc sfty ce MSI %d (error: %d)\n", + __func__, priv->sfty_ce_irq, ret); + irq_err = REQ_IRQ_ERR_LPI; + goto irq_error; + } + } + + /* Request the Safety Feature Uncorrectible Error line in + * case of another line is used + */ + if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq) { + int_name = priv->int_name_sfty_ue; + sprintf(int_name, "%s:%s", dev->name, "safety-ue"); + ret = devm_request_irq(pdev, priv->sfty_ue_irq, + dn200_safety_interrupt, 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc sfty ue MSI %d (error: %d)\n", + __func__, priv->sfty_ue_irq, ret); + irq_err = REQ_IRQ_ERR_SFTY_CE; + goto irq_error; + } + } + if (priv->txrx_itr_combined) { + /* Request Rx MSI irq */ + for (i = 0; i < priv->plat->rx_queues_to_use && + i < priv->plat->tx_queues_to_use; i++) { + cpumask_clear(priv->cpu_mask); + cpuid = + cpumask_local_spread(i + priv->plat_ex->pf_id * 8, + priv->numa_node); + cpumask_set_cpu(cpuid, priv->cpu_mask); + if (priv->rx_irq[i]) { + int_name = priv->int_name_rx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "rx", + i); + ret = + devm_request_irq(pdev, priv->rx_irq[i], + dn200_msi_intr_rxtx, 0, + int_name, &priv->channel[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc rx-%d MSI %d (error: %d)\n", + __func__, i, priv->rx_irq[i], + ret); + irq_err = REQ_IRQ_ERR_RXTX; + irq_idx = i; + goto irq_error; + } + irq_set_affinity_hint(priv->rx_irq[i], + priv->cpu_mask); + } + if (priv->tx_irq[i]) { + int_name = priv->int_name_tx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "tx", + i); + ret = + 
devm_request_irq(pdev, priv->tx_irq[i], + dn200_msi_intr_rxtx, 0, + int_name, &priv->channel[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc tx-%d MSI %d (error: %d)\n", + __func__, i, priv->tx_irq[i], + ret); + devm_free_irq(pdev, priv->rx_irq[i], + &priv->rx_queue[i]); + irq_err = REQ_IRQ_ERR_RXTX; + irq_idx = i; + goto irq_error; + } + irq_set_affinity_hint(priv->tx_irq[i], + priv->cpu_mask); + } + } + } else { + /* Request Rx MSI irq */ + for (i = 0; i < priv->plat->rx_queues_to_use; i++) { + if (priv->rx_irq[i] == 0) + continue; + int_name = priv->int_name_rx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "rx", i); + ret = devm_request_irq(pdev, priv->rx_irq[i], + dn200_msi_intr_rx, + 0, int_name, &priv->rx_queue[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc rx-%d MSI %d (error: %d)\n", + __func__, i, priv->rx_irq[i], ret); + irq_err = REQ_IRQ_ERR_RX; + irq_idx = i; + goto irq_error; + } + } + + /* Request Tx MSI irq */ + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + if (priv->tx_irq[i] == 0) + continue; + int_name = priv->int_name_tx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "tx", i); + ret = devm_request_irq(pdev, priv->tx_irq[i], + dn200_msi_intr_tx, + 0, int_name, &priv->tx_queue[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc tx-%d MSI %d (error: %d)\n", + __func__, i, priv->tx_irq[i], ret); + irq_err = REQ_IRQ_ERR_TX; + irq_idx = i; + goto irq_error; + } + } + dn200_set_affinity_hint(dev); + } + + ret = 0; + return ret; +irq_error: + dn200_free_irq(dev, irq_err, irq_idx); + return ret; +} + +static int dn200_request_irq_single(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct device *pdev = priv->device; + enum request_irq_err irq_err; + int ret; + + ret = devm_request_irq(pdev, dev->irq, dn200_interrupt, + IRQF_SHARED, dev->name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: ERROR: allocating the IRQ %d (error: %d)\n", 
+ __func__, dev->irq, ret); + irq_err = REQ_IRQ_ERR_MAC; + goto irq_error; + } + + /* Request the IRQ lines */ + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) { + ret = devm_request_irq(pdev, priv->lpi_irq, dn200_interrupt, + IRQF_SHARED, dev->name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: ERROR: allocating the LPI IRQ %d (%d)\n", + __func__, priv->lpi_irq, ret); + irq_err = REQ_IRQ_ERR_LPI; + goto irq_error; + } + } + + return 0; + +irq_error: + dn200_free_irq(dev, irq_err, 0); + return ret; +} + +static int dn200_request_irq(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret = 0; + + /* Request the IRQ lines */ + if (priv->plat->multi_msi_en) + ret = dn200_request_irq_multi_msi(dev); + else + ret = dn200_request_irq_single(dev); + + return ret; +} + +/*Check dma TX/RX status, if not idle/stop status, we should reset hw.*/ +static int dn200_check_dma_status(struct dn200_priv *priv) +{ + u16 chan = 0; + int ret = 0; + + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + ret |= + dn200_check_chan_status(priv, priv->ioaddr, chan, priv->hw, + true); + } + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) { + ret |= + dn200_check_chan_status(priv, priv->ioaddr, chan, priv->hw, + false); + } + if (ret) { + netdev_warn(priv->dev, + "dma status is in error status before open\n"); + return -1; + } + return 0; +} + +static void dn200_set_vf_heart_vf_state(struct dn200_priv *priv, u8 val, + bool enable) +{ + u8 registered_vf_state = 0; + + DN200_HEARTBEAT_GET(priv->hw, registered_vf_state, + priv->plat_ex->vf_offset, ®istered_vf_state); + if (enable) + registered_vf_state |= val; + else + registered_vf_state &= ~val; + DN200_HEARTBEAT_SET(priv->hw, registered_vf_state, + priv->plat_ex->vf_offset, registered_vf_state); +} + +static void dn200_vf_init_heartbeat(struct dn200_priv *priv) +{ + u8 last_beat, beat; + + if (!PRIV_IS_VF(priv)) + return; + DN200_HEARTBEAT_SET(priv->hw, 
registered_vf_state, + priv->plat_ex->vf_offset, + DN200_VF_REG_STATE_OPENED); + DN200_HEARTBEAT_GET(priv->hw, last_heartbeat, priv->plat_ex->vf_offset, + &last_beat); + beat = !last_beat; + DN200_HEARTBEAT_SET(priv->hw, heartbeat, priv->plat_ex->vf_offset, + beat); +} + +static void dn200_set_am_and_pm(struct dn200_priv *priv) +{ + u32 value = 0; + + value &= ~(XGMAC_FILTER_PR | XGMAC_FILTER_HMC | XGMAC_FILTER_PM); + value |= XGMAC_FILTER_PR; + writel(value, priv->ioaddr + XGMAC_PACKET_FILTER); +} + +static int _dn200_set_umac_addr(struct dn200_priv *priv, unsigned char *addr, u8 reg_n) +{ + int ret = 0; + u8 wakeup_wq = false; + + ret = dn200_set_umac_addr(priv, priv->hw, addr, reg_n, &wakeup_wq); + if (ret < 0) { + netdev_err(priv->dev, "%s: set umac fail.\n", __func__); + return ret; + } + priv->pf_rxp_set |= RXP_SET_UMAC; + if (wakeup_wq && PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + + if (PRIV_IS_VF(priv)) { + netdev_dbg(priv->dev, "%s: notify pf set umac.\n", __func__); + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, RXP_TASK, 1); + irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); + } + return ret; +} + +static void _dn200_config_vlan_rx_fltr(struct dn200_priv *priv, struct mac_device_info *hw, bool enable) +{ + if (PRIV_IS_PUREPF(priv)) { + dn200_config_vlan_rx_fltr(priv, priv->hw, enable); + } else if (PRIV_SRIOV_SUPPORT(priv)) { + priv->pf_rxp_set |= RXP_SET_VLAN_FIL; + priv->vlan_fil_enable = enable; + if (PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + } +} + +static int dn200_set_features(struct net_device *netdev, + netdev_features_t features); +static void dn200_set_rx_mode(struct net_device *dev); +static void dn200_vlan_reconfig(struct 
dn200_priv *priv); +static void dn200_fdirs_reconfig(struct dn200_priv *priv, bool enable); + +static void dn200_eth_reconfig(struct dn200_priv *priv) +{ + struct net_device *dev = priv->dev; + int ret = 0; + /*if sriov support && sriov enable */ + if (PRIV_SRIOV_SUPPORT(priv)) { + /*enable DDS and mcbc duplicate */ + dn200_set_am_and_pm(priv); + dn200_rx_dds_config(priv, priv->hw, true); + } + + ret = _dn200_set_umac_addr(priv, (unsigned char *)priv->dev->dev_addr, 0); + if (ret < 0) + netdev_err(priv->dev, "%s: set umac fail.\n", __func__); + /*init vlan filter */ + if (!HW_IS_VF(priv->hw)) + dn200_init_hw_vlan_rx_fltr(priv, priv->hw); + dn200_set_features(dev, dev->features); + dn200_set_rx_mode(dev); + dn200_set_mac_loopback(priv, priv->ioaddr, + !!(dev->features & NETIF_F_LOOPBACK)); + dn200_config_hw_tstamping(priv, priv->ptpaddr, priv->systime_flags); + if (!HW_IS_VF(priv->hw)) { + dn200_rxp_config(priv, priv->hw, priv->tc_entries, + priv->tc_entries_max); + } + if (!HW_IS_VF(priv->hw)) + dn200_vlan_reconfig(priv); + if (!HW_IS_VF(priv->hw)) { + dn200_fdirs_reconfig(priv, true); + dn200_rss_configure(priv, priv->hw, &priv->rss, + priv->plat->rx_queues_to_use); + } + if (priv->vxlan_port) + dn200_vxlan_set(dev); + + if (!(PRIV_IS_PUREPF(priv) || PRIV_IS_VF(priv))) { + _dn200_config_vlan_rx_fltr(priv, priv->hw, + !(priv->dev->flags & IFF_PROMISC)); + } +} + +static inline void dn200_rx_itr_usec_update(struct dn200_priv *priv) +{ + /* for 10Gbit/s bandwidth(or 1250Byte/us), + * packet total size = (mtu * dma_rx_size) can hold + * rx_usec = ((mtu * priv->dma_rx_size)/1250), + * so rx watchdog should take effect within 1/2 * rx_usec + */ + const int mtu = priv->dev->mtu; + + priv->rx_itr_usec = (((mtu * priv->dma_rx_size) / 125) * 1000 / priv->speed) >> 1; + if (priv->rx_itr_usec > priv->max_usecs) + priv->rx_itr_usec = priv->max_usecs; + priv->rx_itr_usec_min = priv->min_usecs; + dev_dbg(priv->device, "%s, %d, priv->rx_itr_usec:%u, priv->rx_itr_usec_min:%u\n", + 
__func__, __LINE__, priv->rx_itr_usec, priv->rx_itr_usec_min); +} + +static int dn200_reset(struct dn200_priv *priv) +{ + int ret = 0; + + if (!priv->mii) { + /*add phy clock's stable judge*/ + ret = dn200_phy_clock_stable_judge(PRIV_PHY_INFO(priv)); + if (ret) { + netdev_warn(priv->dev, + "%s %d phy clock maybe not stable,give more time\n", + __func__, __LINE__); + usleep_range(1000000, 2000000); + } + } + + ret = dn200_dma_reset(priv, priv->ioaddr, priv->hw); + if (ret) { + dev_err(priv->device, "dma reset err\n"); + return ret; + } + + ret = dn200_rxf_and_acl_mem_reset(priv, priv->hw); + if (ret) { + dev_err(priv->device, "rxf reset err\n"); + return ret; + } + + return 0; +} + +static int dn200_open_continue(struct dn200_priv *priv) +{ + u8 vf_link_notify = 0; + int ret = 0; + + if (PRIV_SRIOV_SUPPORT(priv)) { + set_bit(DN200_DEV_INIT, &priv->state); + + /* reset rxp */ + ret = dn200_reset_rxp(priv, priv->hw); + + /* init broadcast rxp */ + if (ret == 0) + ret = dn200_rxp_broadcast(priv, priv->hw); + } + + /* if rxp init failed, clear down & opening state, but keep dev init state */ + if (ret) { + netdev_info(priv->dev, "%s, %d, rxp init failure.\n", __func__, __LINE__); + goto err_rxp; + } + + /* can't configure rxp mac, vlan, filter before dev init complete */ + clear_bit(DN200_DEV_INIT, &priv->state); + + if (PRIV_IS_VF(priv)) + clear_bit(DN200_VF_IN_STOP, &priv->state); + /* ethernet reconfig for two conditions: + * 1. dev down and open, to restore user configurations + * 2. 
protocol stack configurations set in the process of dev open + */ + dn200_eth_reconfig(priv); + + netif_tx_start_all_queues(priv->dev); + /* if occur err, don't start phy and keep carrier off */ + if (PRIV_PHY_OPS_CHECK(priv) && PRIV_PHY_OPS(priv)->start) + PRIV_PHY_OPS(priv)->start(PRIV_PHY_INFO(priv)); + + if (PRIV_IS_VF(priv)) + dn200_vf_init_heartbeat(priv); + timer_setup(&priv->keepalive_timer, dn200_heartbeat, 0); + mod_timer(&priv->keepalive_timer, msecs_to_jiffies(1000)); + + if (PRIV_IS_VF(priv)) { + DN200_VF_LINK_GET(priv, priv->plat_ex->vf_offset, + &vf_link_notify); + if (!vf_link_notify) + DN200_VF_LINK_SET(priv, priv->plat_ex->vf_offset, 1); + } else { + DN200_SET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, 1); + } + /* keep following code at the end !!! */ +err_rxp: + clear_bit(DN200_DCB_DOWN, &priv->state); + clear_bit(DN200_DOWN, &priv->state); + + /* must notify vf to reset after reconfig, otherwise vf can't work */ + if (ret == 0 && priv->plat_ex->sriov_cfg) + _dn200_vf_flow_open(priv); + + return ret; +} + +/** + * dn200_open - open entry point of the driver + * @dev : pointer to the device structure. + * Description: + * This function is the open entry point of the driver. + * Return value: + * 0 on success and an appropriate (-)ve integer as defined in errno.h + * file on failure. 
+ */ + +static int dn200_open(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + int bfsize = 0; + u32 chan; + u8 states = 0; + int ret = 0, dma_chan_err = 0; + + if (test_bit(DN200_SYS_SUSPENDED, &priv->state)) + return 0; + + if (test_and_set_bit(DN200_UP, &priv->state)) + return 0; + /* forbid to set rxp in dev init state; + * clear dev init state when rxp complete to reset & init + */ + set_bit(DN200_DEV_INIT, &priv->state); + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + clear_bit(DN200_UP, &priv->state); + return -EBUSY; + } + + if (test_bit(DN200_DEV_ERR_CLOSE, &priv->state)) { + netdev_err(dev, "%s: %s\n", __func__, DN200_FW_ERR_MSG); + clear_bit(DN200_UP, &priv->state); + return -EBUSY; + } + if (!PRIV_IS_VF(priv)) { + unsigned long time_start = jiffies; + + while (!test_bit(DN200_PROBE_FINISHED, &priv->state)) { + usleep_range(1000, 2000); + if (time_after(jiffies, time_start + msecs_to_jiffies(4000))) { + netdev_err(dev, "%s: probing took 4s, but not finish, may have errs\n", __func__); + clear_bit(DN200_UP, &priv->state); + return -EBUSY; + } + } + } + if (PRIV_IS_VF(priv)) { + DN200_GET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, &states); + if (!states) { + netdev_err(dev, + "Unable to start - perhaps the PF Driver isn't up yet.\n"); + clear_bit(DN200_UP, &priv->state); + return -EAGAIN; + } + } + netif_carrier_off(dev); + if (PRIV_IS_VF(priv)) + dn200_stop_all_dma(priv); + if (dn200_iatu_init(priv) < 0) + goto iatu_init_error; + + /* limit the rx buffer size to 1536 or 3K */ + bfsize = dn200_get_bfsize(); + priv->dma_buf_sz = bfsize; + + if (!priv->dma_tx_size) + priv->dma_tx_size = DMA_DEFAULT_TX_SIZE; + if (!priv->dma_rx_size) { + if (PRIV_IS_VF(priv)) + priv->dma_rx_size = DMA_DEFAULT_VF_RX_SIZE; + else + priv->dma_rx_size = DMA_DEFAULT_RX_SIZE; + } + + /* Earlier check for TBS */ + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + struct dn200_tx_queue 
*tx_q = &priv->tx_queue[chan]; + int tbs_en = priv->plat->tx_queues_cfg[chan].tbs_en; + + /* Setup per-TXQ tbs flag before TX descriptor alloc */ + tx_q->tbs |= tbs_en ? DN200_TBS_AVAIL : 0; + } + ret = alloc_dma_desc_resources(priv); + if (ret < 0) { + netdev_err(priv->dev, "%s: DMA descriptors allocation failed\n", + __func__); + goto dma_desc_error; + } + ret = init_dma_desc_rings(dev, GFP_KERNEL); + if (ret < 0) { + netdev_err(priv->dev, + "%s: DMA descriptors initialization failed\n", + __func__); + goto init_error; + } + dn200_add_rx_iatu2tx(priv); + dn200_mac_set(priv, priv->ioaddr, false, priv->hw); + if (PRIV_IS_VF(priv)) { + /* check all dma channel status, if any channel existed error, + * will call global error reset in the end + */ + dma_chan_err = dn200_check_dma_status(priv); + } else { + if (priv->mii) { + /* set rgmii rx clock from soc */ + dn200_xgmac_rx_ext_clk_set(priv, false); + /* workaround: set phy loopback for reg timeout */ + mdiobus_write(priv->mii, priv->plat->phy_addr, 0, 0x4140); + } + /* hw reset when pure pf or sriov pf up */ + ret = dn200_reset(priv); + if (ret) { + dev_err(priv->device, "Failed to reset the dma\n"); + goto init_error; + } + if (PRIV_SRIOV_SUPPORT(priv) && !PRIV_IS_VF(priv)) + dn200_sriov_mail_init(priv); + } + if (priv->hw->pcs != DN200_PCS_TBI && priv->hw->pcs != DN200_PCS_RTBI) { + ret = dn200_init_phy(dev); + if (ret) { + netdev_err(priv->dev, + "%s: Cannot attach to PHY (error: %d)\n", + __func__, ret); + goto init_error; + } + } + ret = dn200_hw_setup(dev, true); + if (ret < 0) { + netdev_err(priv->dev, "%s: Hw setup failed\n", __func__); + goto init_error; + } + /* Configure real RX and TX queues */ + netif_set_real_num_rx_queues(dev, priv->plat->rx_queues_to_use); + netif_set_real_num_tx_queues(dev, priv->plat->tx_queues_to_use); + + dn200_init_coalesce(priv); + ret = dn200_request_irq(dev); + if (ret) + goto irq_error; + + netdev_update_features(dev); + if (PRIV_IS_PUREPF(priv)) + dn200_napi_add(priv->dev); 
+ dn200_enable_all_queues(priv); + dn200_enable_all_dma_irq(priv); + /*modify vf tx timeout to 15s to avoid vm pending*/ + + if (dma_chan_err) { + dn200_global_err(priv, DN200_DMA_CHAN_ERR); + } else { + if (priv->mii) { + /* cancel phy loopback */ + mdiobus_write(priv->mii, priv->plat->phy_addr, 0, 0x9140); + } + } + + /* base on new desc ring size to update rx interrupt usec */ + dn200_rx_itr_usec_update(priv); + + /* reconfig pf & vf settings */ + if (!dma_chan_err) + dn200_open_continue(priv); + + if (PRIV_IS_VF(priv)) + dn200_start_all_dma(priv); + return 0; + +irq_error: + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + if (priv->tx_queue[chan].tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].tx_task); + if (priv->tx_queue[chan].txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].txtimer); + memset(&priv->tx_queue[chan].txtimer, 0, + sizeof(struct hrtimer)); + if (priv->tx_queue[chan].poll_tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].poll_tx_task); + if (priv->tx_queue[chan].poll_txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].poll_txtimer); + memset(&priv->tx_queue[chan].poll_txtimer, 0, + sizeof(struct hrtimer)); + } + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) { + if (priv->rx_queue[chan].poll_rx_task.func) + cancel_work_sync(&priv->rx_queue[chan].poll_rx_task); + if (priv->rx_queue[chan].poll_rxtimer.function) + hrtimer_cancel(&priv->rx_queue[chan].poll_rxtimer); + memset(&priv->rx_queue[chan].poll_rxtimer, 0, + sizeof(struct hrtimer)); + } + dn200_hw_teardown(dev); +init_error: + free_dma_desc_resources(priv); + +dma_desc_error: +iatu_init_error: + clear_bit(DN200_UP, &priv->state); + return ret; +} + +static void dn200_vf_clear_heartbeat(struct dn200_priv *priv) +{ + if (!PRIV_IS_VF(priv)) + return; + dn200_set_vf_heart_vf_state(priv, DN200_VF_REG_STATE_OPENED, false); + DN200_HEARTBEAT_SET(priv->hw, last_heartbeat, priv->plat_ex->vf_offset, + 0); + DN200_HEARTBEAT_SET(priv->hw, heartbeat, 
priv->plat_ex->vf_offset, 0); +} + +static void dn200_release_remain(struct dn200_priv *priv, bool pcie_ava) +{ + u8 vf_link_notify = 0; + /* cancel the reconfig task used by hw locked failure when open */ + if (priv->reconfig_task.func) + cancel_work_sync(&priv->reconfig_task); + del_timer_sync(&priv->keepalive_timer); + if (priv->eee_enabled) { + priv->tx_path_in_lpi_mode = false; + del_timer_sync(&priv->eee_ctrl_timer); + } + /* DCB */ + if (!PRIV_IS_VF(priv) && pcie_ava) + dn200_dcbnl_init(priv, false); + dn200_release_ptp(priv); + + if (PRIV_PHY_OPS_CHECK(priv) && PRIV_PHY_OPS(priv)->stop) + PRIV_PHY_OPS(priv)->stop(PRIV_PHY_INFO(priv)); + set_bit(DN200_DCB_DOWN, &priv->state); + if (PRIV_IS_VF(priv) && pcie_ava) { + DN200_VF_LINK_GET(priv, priv->plat_ex->vf_offset, + &vf_link_notify); + if (vf_link_notify) + DN200_VF_LINK_SET(priv, priv->plat_ex->vf_offset, 0); + } + memset(&priv->hw->set_state, 0, sizeof(struct dn200_set_state)); +} + +/** + * dn200_release - close entry point of the driver + * @dev : device pointer. + * Description: + * This is the stop entry point of the driver. 
+ */ +static int dn200_release(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 chan = 0; + u8 vf_link_notify = 0; + int rx_state = 0; + bool pcie_ava = false; + + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) { + pcie_ava = dn200_hwif_id_check(priv->ioaddr); + if (!pcie_ava) + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + } + + if (test_bit(DN200_DOWN, &priv->state) && priv->vf_sw_close_flag && PRIV_IS_VF(priv)) { + dn200_release_remain(priv, pcie_ava); + priv->vf_sw_close_flag = false; + return 0; + } + if (test_and_set_bit(DN200_DOWN, &priv->state)) + return 0; + if (pcie_ava) { + if (!PRIV_IS_VF(priv)) + dn200_flow_ctrl(priv, priv->hw, false, FLOW_OFF, + 0, priv->plat->tx_queues_to_use); + if (PRIV_PHY_OPS_CHECK(priv) && PRIV_PHY_OPS(priv)->phy_timer_del) + PRIV_PHY_OPS(priv)->phy_timer_del(PRIV_PHY_INFO(priv)); + /* stop all vfs flow when pf down or release */ + rx_state = dn200_mac_rx_get(priv, priv->ioaddr); + if (rx_state) + dn200_mac_rx_set(priv, priv->ioaddr, false); + if (!test_bit(DN200_DEV_ERR_CLOSE, &priv->state)) + dn200_vf_flow_close(priv); + } + /* cancel the reconfig task used by hw locked failure when open */ + if (priv->reconfig_task.func) + cancel_work_sync(&priv->reconfig_task); + if (pcie_ava) { + if (!PRIV_IS_VF(priv)) + DN200_SET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, 0); + } + del_timer_sync(&priv->keepalive_timer); + if (pcie_ava) { + /* disable mac rx engine before clean tx queues and del rxp */ + if (PRIV_SRIOV_SUPPORT(priv)) + dn200_clean_all_tx_queues(priv, priv->plat_ex->tx_queues_total); + else + dn200_clean_all_tx_queues(priv, priv->plat->tx_queues_to_use); + /* release rxp resource firstly to prevents + * broadcast packets from being sent to VF + */ + if (PRIV_IS_VF(priv)) { + dn200_vf_clear_heartbeat(priv); + dn200_vf_del_rxp(priv, priv->hw); + if (PRIV_IS_VF(priv)) { + netdev_dbg(priv->dev, "%s: notify pf set umac.\n", __func__); + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, RXP_TASK, 1); + 
irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); + } + } + } + + dn200_disable_all_queues(priv); + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + if (priv->tx_queue[chan].tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].tx_task); + if (priv->tx_queue[chan].txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].txtimer); + memset(&priv->tx_queue[chan].txtimer, 0, + sizeof(struct hrtimer)); + if (priv->tx_queue[chan].poll_tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].poll_tx_task); + if (priv->tx_queue[chan].poll_txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].poll_txtimer); + memset(&priv->tx_queue[chan].poll_txtimer, 0, + sizeof(struct hrtimer)); + } + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) { + if (priv->rx_queue[chan].poll_rx_task.func) + cancel_work_sync(&priv->rx_queue[chan].poll_rx_task); + if (priv->rx_queue[chan].poll_rxtimer.function) + hrtimer_cancel(&priv->rx_queue[chan].poll_rxtimer); + memset(&priv->rx_queue[chan].poll_rxtimer, 0, + sizeof(struct hrtimer)); + } + if (!PRIV_IS_VF(priv)) { + if (!PRIV_IS_PUREPF(priv)) + udelay(100); + else + usleep_range(1000, 2000); + } + /* Free the IRQ lines */ + dn200_free_irq(dev, REQ_IRQ_ERR_ALL, 0); + if (PRIV_IS_PUREPF(priv)) + dn200_napi_del(priv->dev); + if (priv->eee_enabled) { + priv->tx_path_in_lpi_mode = false; + del_timer_sync(&priv->eee_ctrl_timer); + } + + if (pcie_ava) { + /* Stop TX/RX DMA and clear the descriptors */ + dn200_stop_all_dma(priv); + + /* enable mac rx engine state to deal with vf down but pf up */ + if (PRIV_IS_VF(priv) && rx_state) + dn200_mac_rx_set(priv, priv->ioaddr, true); + } + /* DCB */ + if (!PRIV_IS_VF(priv) && pcie_ava) + dn200_dcbnl_init(priv, false); + /* Release and free the Rx/Tx resources */ + free_dma_desc_resources(priv); + if (pcie_ava) { + dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc); + /* Disable the MAC Rx/Tx */ + dn200_mac_set(priv, priv->ioaddr, false, priv->hw); + } + + 
dn200_release_ptp(priv); + + if (PRIV_PHY_OPS_CHECK(priv) && PRIV_PHY_OPS(priv)->stop) + PRIV_PHY_OPS(priv)->stop(PRIV_PHY_INFO(priv)); + if (pcie_ava) { + if (priv->mii) + /* set rgmii rx clock from soc */ + dn200_xgmac_rx_ext_clk_set(priv, false); + } + set_bit(DN200_DCB_DOWN, &priv->state); + if (PRIV_IS_VF(priv) && pcie_ava) { + DN200_VF_LINK_GET(priv, priv->plat_ex->vf_offset, + &vf_link_notify); + if (vf_link_notify) + DN200_VF_LINK_SET(priv, priv->plat_ex->vf_offset, 0); + } + memset(&priv->hw->set_state, 0, sizeof(struct dn200_set_state)); + + clear_bit(DN200_UP, &priv->state); + return 0; +} + +static bool dn200_vlan_insert(struct dn200_priv *priv, struct sk_buff *skb, + struct dn200_tx_queue *tx_q) +{ + u16 tag = 0x0, inner_tag = 0x0; + u32 inner_type = 0x0; + struct dma_desc *p; + + if (!(priv->dev->features & NETIF_F_HW_VLAN_CTAG_TX)) + return false; + if (!priv->dma_cap.vlins) + return false; + if (!skb_vlan_tag_present(skb)) + return false; + if (skb->vlan_proto == htons(ETH_P_8021AD)) + return false; + + tag = skb_vlan_tag_get(skb); + + p = &tx_q->dma_tx[tx_q->cur_tx]; + + if (dn200_set_desc_vlan_tag(priv, p, tag, inner_tag, inner_type)) + return false; + priv->swc.mmc_tx_vlan_insert++; + dn200_set_tx_owner(priv, p); + tx_q->cur_tx = DN200_GET_ENTRY(tx_q->cur_tx, priv->dma_tx_size); + return true; +} + +/** + * dn200_tso_allocator - close entry point of the driver + * @priv: driver private structure + * @des: buffer start address + * @total_len: total length to fill in descriptors + * @last_segment: condition for the last descriptor + * @queue: TX queue index + * Description: + * This function fills descriptor and request new descriptors according to + * buffer length to fill + */ +static void dn200_tso_allocator(struct dn200_priv *priv, dma_addr_t des, + int total_len, bool last_segment, u32 queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + struct dma_desc *desc; + u32 buff_size; + int tmp_len; + + tmp_len = total_len; + + while 
(tmp_len > 0) { + dma_addr_t curr_addr; + + tx_q->cur_tx = DN200_GET_ENTRY(tx_q->cur_tx, priv->dma_tx_size); + WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]); + + desc = &tx_q->dma_tx[tx_q->cur_tx]; + + curr_addr = des + (total_len - tmp_len); + dn200_set_desc_addr(priv, desc, curr_addr, priv->hw); + + buff_size = + tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len; + + dn200_prepare_tso_tx_desc(priv, desc, 0, buff_size, + 0, 1, + (last_segment) && + (tmp_len <= TSO_MAX_BUFF_SIZE), 0, + 0); + tmp_len -= TSO_MAX_BUFF_SIZE; + } +} + +static void dn200_flush_tx_descriptors(struct dn200_priv *priv, int queue) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + int desc_size; + + desc_size = sizeof(struct dma_desc); + + /* The own bit must be the latest setting done when prepare the + * descriptor and then barrier is needed to make sure that + * all is coherent before granting the DMA engine. + */ + wmb(); + + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * desc_size); + dn200_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue, + priv->hw); +} + +static bool dn200_is_vxlan(struct sk_buff *skb) +{ + if (!skb->encapsulation) + return false; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (ip_hdr(skb)->protocol != IPPROTO_UDP) + return false; + break; + + case htons(ETH_P_IPV6): + if (ipv6_hdr(skb)->nexthdr != IPPROTO_UDP) + return false; + break; + + default: + return false; + } + if (skb->inner_protocol_type != ENCAP_TYPE_ETHER || + skb->inner_protocol != htons(ETH_P_TEB) || + (skb_inner_mac_header(skb) - skb_transport_header(skb) != + sizeof(struct udphdr) + sizeof(struct vxlanhdr))) + return false; + return true; +} + +static int dn200_dma32_buf_get(struct dn200_priv *priv, + struct dn200_tx_queue *tx_q, int buf_len, + int entry) +{ + int order = 0, pgs = 0; + struct dn200_tx_dma32_buff *tx_buf = &tx_q->tx_dma32_bufs[entry]; + + if (tx_buf->mem_type == DN200_NORMAL) { + pgs = DIV_ROUND_UP(buf_len, PAGE_SIZE); + order = 
ilog2(roundup_pow_of_two(pgs)); + + tx_buf->page = __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN | GFP_DMA32, order); + if (!tx_buf->page) { + dev_err(priv->device, "no page memory: %s, %d.\n", + __func__, __LINE__); + return -1; + } + tx_buf->order = order; + } else { + dev_err(priv->device, + "%s, %d, tx buf is dma32 already, should release it before use, entry:%d, queue:%d, buf:%pa.\n", + __func__, __LINE__, entry, tx_q->queue_index, + &tx_buf->buf); + return -1; + } + + return 0; +} + +/** + * dn200_dma32_allocator - alloc a dma32 tx buffer from input queue's struct + * @priv: driver private structure + * @data: data buffer start address + * @total_len: total length of data + * @queue: TX queue index + * @entry: Tx buffer index + * Description: + * This function allocate dma32 buffer and map to dma address, + * if success return the mapped dma address, if failure return 0 + */ +static dma_addr_t dn200_dma32_allocator(struct dn200_priv *priv, void *data, + int total_len, u32 queue, int entry, + u64 *base_addr) +{ + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + struct page *page = NULL; + dma_addr_t dma_addr; + int ret = 0; + u8 *dest = NULL; + + ret = dn200_dma32_buf_get(priv, tx_q, total_len, entry); + if (ret < 0) + return 0; + + page = tx_q->tx_dma32_bufs[entry].page; + if (!page) { + dev_err(priv->device, + "page is null or dma32 buf get failure: %s, %d, entry:%d.\n", + __func__, __LINE__, entry); + return 0; + } + + dest = page_address(page); + memcpy(dest, (u8 *)data, total_len); + + dma_addr = dma_map_page_attrs(priv->device, page, 0, total_len, DMA_TO_DEVICE, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, dma_addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", __func__, + __LINE__); + __free_pages(page, tx_q->tx_dma32_bufs[entry].order); + return 0; + } + + /* should find the iatu base address for address lower or higher than 32 bit + * when iommu enabled, the dma address can higher than 32 
bit, + * 1. when lower than 32 bit, just use dma32 iatu base address + * 2. when higher that 32 bit, use a new iatu entry's base address + */ + *base_addr = dma_addr; + if (!dma_can_direct_use(priv, dma_addr)) { + if (dn200_tx_iatu_find + (dma_addr, tx_q, &tx_q->tx_dma32_bufs[entry].iatu_ref_ptr, + base_addr) < 0) { + dev_err(priv->device, + "%s, %d, dma32 addr iatu find failed, dma_addr:%#llx, page phys:%#llx\n", + __func__, __LINE__, dma_addr, + page_to_phys(page)); + __free_pages(page, tx_q->tx_dma32_bufs[entry].order); + dma_unmap_page(priv->device, dma_addr, total_len, + DMA_TO_DEVICE); + return 0; + } + } + priv->swc.tx_mem_copy++; + priv->tx_mem_copy++; + tx_q->tx_dma32_bufs[entry].mem_type = DN200_DMA32; + tx_q->tx_dma32_bufs[entry].buf = dma_addr; + tx_q->tx_dma32_bufs[entry].len = total_len; + + return dma_addr; +} + +/** + * dn200_tso_xmit - Tx entry point of the driver for oversized frames (TSO) + * @skb : the socket buffer + * @dev : device pointer + * Description: this is the transmit function that is called on TSO frames + * (support available on GMAC4 and newer chips). + * Diagram below show the ring programming in case of TSO frames: + * + * First Descriptor + * -------- + * | DES0 |---> buffer1 = L2/L3/L4 header + * | DES1 |---> TCP Payload (can continue on next descr...) + * | DES2 |---> buffer 1 and 2 len + * | DES3 |---> must set TSE, TCP hdr len-> [22:19]. TCP payload len [17:0] + * -------- + * | + * ... + * | + * -------- + * | DES0 | --| Split TCP Payload on Buffers 1 and 2 + * | DES1 | --| + * | DES2 | --> buffer 1 and 2 len + * | DES3 | + * -------- + * + * mss is fixed when enable tso, so w/o programming the TDES3 ctx field. 
+ */ +static netdev_tx_t dn200_tso_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int tunnel_flag = 0; + struct dma_desc *desc, *first, *mss_desc = NULL; + struct dn200_priv *priv = netdev_priv(dev); + int nfrags = skb_shinfo(skb)->nr_frags; + u32 queue = skb_get_queue_mapping(skb); + unsigned int first_entry, entry, tx_packets; + int tmp_pay_len = 0, first_tx; + struct dn200_tx_queue *tx_q; + bool has_vlan, set_ic; + u8 proto_hdr_len, hdr; + u32 pay_len, mss; + dma_addr_t des = 0, pre_des = 0; + int i; + dma_addr_t dma32_addr; + u64 base_addr; + + tx_q = &priv->tx_queue[queue]; + first_tx = tx_q->cur_tx; + + /* Compute header lengths */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { + proto_hdr_len = + skb_transport_offset(skb) + sizeof(struct udphdr); + hdr = sizeof(struct udphdr); + } else if (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM)) { + /* get inner TCP segmentation header length */ + hdr = inner_tcp_hdrlen(skb); + proto_hdr_len = skb_inner_transport_offset(skb) + hdr; + tunnel_flag = TSO_DESC_IS_TUNNEL; + } else { + proto_hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + hdr = tcp_hdrlen(skb); + } + + /* Desc availability based on threshold should be enough safe */ + if (unlikely(dn200_tx_avail(priv, queue) < + (((skb->len - proto_hdr_len) / TSO_MAX_BUFF_SIZE + 1)))) { + if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue))) { + netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, + queue)); + /* This is a hard error, log it. 
*/ + netdev_err(priv->dev, + "%s: Tx Ring full when queue awake\n", + __func__); + } + return NETDEV_TX_BUSY; + } + + pay_len = skb_headlen(skb) - proto_hdr_len; /* no frags */ + + mss = skb_shinfo(skb)->gso_size; + + /* set new MSS value if needed */ + if (mss != tx_q->mss) { + mss_desc = &tx_q->dma_tx[tx_q->cur_tx]; + + dn200_set_mss(priv, mss_desc, mss); + tx_q->mss = mss; + tx_q->cur_tx = DN200_GET_ENTRY(tx_q->cur_tx, priv->dma_tx_size); + WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]); + } + + if (netif_msg_tx_queued(priv)) { + dev_info(priv->device, + "%s: hdrlen %d, hdr_len %d, pay_len %d, mss %d\n", + __func__, hdr, proto_hdr_len, pay_len, mss); + dev_info(priv->device, "\tskb->len %d, skb->data_len %d\n", + skb->len, skb->data_len); + } + + /* Check if VLAN can be inserted by HW */ + has_vlan = dn200_vlan_insert(priv, skb, tx_q); + + first_entry = tx_q->cur_tx; + WARN_ON(tx_q->tx_skbuff[first_entry]); + + desc = &tx_q->dma_tx[first_entry]; + first = desc; + + if (has_vlan) + dn200_set_desc_vlan(priv, first, DN200_VLAN_INSERT); + + /* first descriptor: fill Headers on Buf1 */ + des = dma_map_single(priv->device, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (dma_mapping_error(priv->device, des)) + goto dma_map_err; + tx_q->tx_skbuff_dma[first_entry].buf = des; + tx_q->tx_skbuff_dma[first_entry].len = skb_headlen(skb); + tx_q->tx_skbuff_dma[first_entry].map_as_page = false; + tx_q->tx_skbuff_dma[first_entry].buf_type = DN200_TXBUF_T_SKB; + + dma32_addr = 0; + if (!dma_can_direct_use(priv, des)) { + if (dn200_tx_iatu_find + (des, tx_q, &tx_q->tx_skbuff_dma[first_entry].iatu_ref_ptr, + &base_addr) < 0) { + dev_dbg(priv->device, "%s %d des %llx\n", __func__, + __LINE__, des >> 32); + dma32_addr = + dn200_dma32_allocator(priv, skb->data, + skb_headlen(skb), queue, + first_entry, &base_addr); + if (dma32_addr) { + des = dma32_addr; + } else { + /* use 64bits as DMA mask, hw limit is 40 or 32 bits, + * so the prev dma map output 48bit dma addr can't be used, + * 
just go out + */ + goto dma_map_err; + } + } + + if (!dma32_addr) + des = (des & LIMIT_MASK) | base_addr; + else + des = (des & MAX_LIMIT_MASK) | base_addr; + } + + dn200_set_desc_addr(priv, first, des, priv->hw); + tmp_pay_len = pay_len; + des += proto_hdr_len; + pay_len = 0; + + dn200_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue); + + /* Prepare fragments */ + for (i = 0; i < nfrags; i++) { + unsigned int cur_tx = tx_q->cur_tx; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + atomic_t *iatu_ref_ptr = NULL; + + des = skb_frag_dma_map(priv->device, frag, 0, + skb_frag_size(frag), DMA_TO_DEVICE); + if (dma_mapping_error(priv->device, des)) + goto dma_map_err; + + pre_des = des; + dma32_addr = 0; + if (!dma_can_direct_use(priv, des)) { + if (dn200_tx_iatu_find + (des, tx_q, &iatu_ref_ptr, &base_addr) < 0) { + void *data = + page_address(skb_frag_page(frag)) + + skb_frag_off(frag); + dev_dbg(priv->device, "%s %d des %llx\n", + __func__, __LINE__, des); + cur_tx = + DN200_GET_ENTRY(cur_tx, priv->dma_tx_size); + dma32_addr = + dn200_dma32_allocator(priv, data, + skb_frag_size(frag), + queue, cur_tx, + &base_addr); + if (dma32_addr) { + des = dma32_addr; + } else { + /* use 64bits as DMA mask, hw limit is 40 or 32 bits, + * so the prev dma map output 48bit dma addr can't be used, + * just go out + */ + goto dma_map_err; + } + } + if (!dma32_addr) + des = (des & LIMIT_MASK) | base_addr; + else + des = (des & MAX_LIMIT_MASK) | base_addr; + } + dn200_tso_allocator(priv, des, skb_frag_size(frag), + (i == nfrags - 1), queue); + + tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = pre_des; + tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag); + tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true; + tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = DN200_TXBUF_T_SKB; + tx_q->tx_skbuff_dma[tx_q->cur_tx].iatu_ref_ptr = iatu_ref_ptr; + + if (dma32_addr) { + /* move dma32 tx buffer from previous cur_tx to current cur_tx + * that is updated in tso allocator + */ + if 
(cur_tx != tx_q->cur_tx) { + tx_q->tx_dma32_bufs[tx_q->cur_tx] = + tx_q->tx_dma32_bufs[cur_tx]; + tx_q->tx_dma32_bufs[cur_tx].mem_type = + DN200_NORMAL; + tx_q->tx_dma32_bufs[cur_tx].iatu_ref_ptr = NULL; + } + } + } + + tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true; + + /* Only the last descriptor gets to point to the skb. */ + tx_q->tx_skbuff[tx_q->cur_tx] = skb; + tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = DN200_TXBUF_T_SKB; + + /* Manage tx mitigation */ + desc = &tx_q->dma_tx[tx_q->cur_tx]; + tx_packets = dn200_ring_entries_calc(priv->dma_tx_size, first_tx, tx_q->cur_tx); + tx_q->tx_count_frames += tx_packets; + priv->tx_intr[queue].packet += tx_packets; + + if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en) + set_ic = true; + else if (!priv->tx_coal_frames[queue]) + set_ic = false; + else if (tx_packets > priv->tx_coal_frames[queue]) + set_ic = true; + else if ((tx_q->tx_count_frames % + priv->tx_coal_frames[queue]) < tx_packets) + set_ic = true; + else + set_ic = false; + + if (set_ic) { + tx_q->tx_count_frames = 0; + dn200_set_tx_ic(priv, desc); + priv->xstats.tx_set_ic_bit++; + } + + /* We've used all descriptors we need for this skb, however, + * advance cur_tx so that it references a fresh descriptor. + * ndo_start_xmit will fill this descriptor the next time it's + * called and dn200_tx_clean may clean up to this descriptor. 
+ */ + tx_q->cur_tx = DN200_GET_ENTRY(tx_q->cur_tx, priv->dma_tx_size); + + if (unlikely(dn200_tx_avail(priv, queue) <= (MAX_SKB_FRAGS + 1))) { + netif_dbg(priv, hw, priv->dev, "%s: stop transmitted packets\n", + __func__); + netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); + } + + dev->stats.tx_bytes += skb->len; + priv->tx_intr[queue].bytes += skb->len; + priv->xstats.tx_tso_frames++; + priv->xstats.tx_tso_nfrags += nfrags; + netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); + + if (priv->sarc_type) + dn200_set_desc_sarc(priv, first, priv->sarc_type); + + skb_tx_timestamp(skb); + + if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && + priv->hwts_tx_en)) { + /* declare that device is doing timestamping */ + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + dn200_enable_tx_timestamp(priv, first); + } + /* Complete the first descriptor before granting the DMA */ + dn200_prepare_tso_tx_desc(priv, first, TSO_DESC_IS_FIRST | tunnel_flag, + proto_hdr_len, + pay_len, + 1, + tx_q->tx_skbuff_dma[first_entry].last_segment, + hdr / 4, (skb->len - proto_hdr_len)); + + /* If context desc is used to change MSS */ + if (mss_desc) { + /* Make sure that first descriptor has been completely + * written, including its own bit. This is because MSS is + * actually before first descriptor, so we need to make + * sure that MSS's own bit is the last thing written. + */ + dma_wmb(); + dn200_set_tx_owner(priv, mss_desc); + } + + if (netif_msg_pktdata(priv)) { + dev_info(priv->device, + "%s: curr=%d dirty=%d f=%d, e=%d, f_p=%p, nfrags %d\n", + __func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry, + tx_q->cur_tx, first, nfrags); + dev_info(priv->device, ">>> frame to be transmitted: "); + print_pkt(skb->data, skb_headlen(skb)); + } + + dn200_flush_tx_descriptors(priv, queue); + if (atomic_read(&tx_q->txtimer_running) == 0) + dn200_tx_timer_arm(priv, queue); + /* Make sure that data has been completely written. 
*/ + tx_q->next_to_watch = tx_q->cur_tx; + return NETDEV_TX_OK; +dma_map_err: + entry = tx_q->cur_tx; + for (;;) { + tx_q->tx_skbuff[entry] = NULL; + desc = tx_q->dma_tx + entry; + dn200_unmap_txbuff(priv, tx_q, entry); + dn200_free_dma32_tx_buffer(priv, tx_q, entry); + dn200_release_tx_desc(priv, desc, priv->mode); + if (entry == first_entry) + break; + + entry = DN200_GET_PREVENTRY(entry, priv->dma_tx_size); + } + if (has_vlan) { + entry = DN200_GET_PREVENTRY(entry, priv->dma_tx_size); + desc = tx_q->dma_tx + entry; + dn200_release_tx_desc(priv, desc, priv->mode); + } + tx_q->cur_tx = entry; + dev_err(priv->device, "Tx dma map failed\n"); + dev_kfree_skb(skb); + priv->dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +/** + * dn200_xmit - Tx entry point of the driver + * @skb : the socket buffer + * @dev : device pointer + * Description : this is the tx entry point of the driver. + * It programs the chain or the ring and supports oversized frames + * and SG feature. + */ +static netdev_tx_t dn200_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned int first_entry, tx_packets, enh_desc; + struct dn200_priv *priv = netdev_priv(dev); + unsigned int nopaged_len = skb_headlen(skb); + int i, csum_insertion = 0, is_jumbo = 0; + u32 queue = skb_get_queue_mapping(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int gso = skb_shinfo(skb)->gso_type; + struct dma_desc *desc, *first; + struct dn200_tx_queue *tx_q; + bool has_vlan, set_ic, iatu_lack = false; + int entry, first_tx; + dma_addr_t des; + dma_addr_t dma32_addr; + u64 base_addr; + + if (test_bit(DN200_DOWN, &priv->state)) + return NETDEV_TX_BUSY; + tx_q = &priv->tx_queue[queue]; + first_tx = tx_q->cur_tx; + if (priv->tx_path_in_lpi_mode && priv->eee_sw_timer_en) + dn200_disable_eee_mode(priv); + /* Manage oversized TCP frames for GMAC4 device */ + if (skb_is_gso(skb) && priv->tso) { + if (gso & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) + return dn200_tso_xmit(skb, dev); + } + + if 
(unlikely(dn200_tx_avail(priv, queue) < nfrags + 1)) { + if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue))) { + netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, + queue)); + /* This is a hard error, log it. */ + netdev_err(priv->dev, + "%s: Tx Ring full when queue awake\n", + __func__); + } + return NETDEV_TX_BUSY; + } + if (netif_msg_tx_queued(priv)) { + dev_info(priv->device, + "%s, %d: Tx Ring avial:%d, need:%d\n", + __func__, __LINE__, dn200_tx_avail(priv, queue), + nfrags); + dev_info(priv->device, "\tskb->len %d, skb->data_len %d\n", + skb->len, skb->data_len); + } + /* Check if VLAN can be inserted by HW */ + has_vlan = dn200_vlan_insert(priv, skb, tx_q); + + entry = tx_q->cur_tx; + first_entry = entry; + WARN_ON(tx_q->tx_skbuff[first_entry]); + + csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL); + desc = tx_q->dma_tx + entry; + + first = desc; + + if (has_vlan) + dn200_set_desc_vlan(priv, first, DN200_VLAN_INSERT); + if (dn200_is_vxlan(skb)) + dn200_set_vxlan(priv, first); + + enh_desc = priv->plat->enh_desc; + /* To program the descriptors according to the size of the frame */ + if (enh_desc) + is_jumbo = dn200_is_jumbo_frm(priv, skb->len, enh_desc); + + if (unlikely(is_jumbo)) { + entry = dn200_jumbo_frm(priv, tx_q, skb, csum_insertion); + if (unlikely(entry < 0) && (entry != -EINVAL)) + goto dma_map_err; + } + + for (i = 0; i < nfrags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int len = skb_frag_size(frag); + bool last_segment = (i == (nfrags - 1)); + + entry = DN200_GET_ENTRY(entry, priv->dma_tx_size); + WARN_ON(tx_q->tx_skbuff[entry]); + desc = tx_q->dma_tx + entry; + + des = skb_frag_dma_map(priv->device, frag, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(priv->device, des)) + goto dma_map_err; /* should reuse desc w/o issues */ + tx_q->tx_skbuff_dma[entry].buf = des; + tx_q->tx_skbuff_dma[entry].map_as_page = true; + tx_q->tx_skbuff_dma[entry].len = len; + tx_q->tx_skbuff_dma[entry].last_segment = 
last_segment; + tx_q->tx_skbuff_dma[entry].buf_type = DN200_TXBUF_T_SKB; + + dma32_addr = 0; + if (!dma_can_direct_use(priv, des)) { + if (dn200_tx_iatu_find + (des, tx_q, + &tx_q->tx_skbuff_dma[entry].iatu_ref_ptr, + &base_addr) < 0) { + void *data = + page_address(skb_frag_page(frag)) + + skb_frag_off(frag); + iatu_lack = true; + dma32_addr = + dn200_dma32_allocator(priv, data, len, + queue, entry, + &base_addr); + if (dma32_addr) { + des = dma32_addr; + } else { + /* use 64bits as DMA mask, hw limit is 40 or 32 bits, + * so the prev dma map output 48bit dma addr can't be used, + * just go out + */ + goto dma_map_err; + } + } + + if (!dma32_addr) + des = (des & LIMIT_MASK) | base_addr; + else + des = (des & MAX_LIMIT_MASK) | base_addr; + } + dn200_set_desc_addr(priv, desc, des, priv->hw); + /* Prepare the descriptor and set the own bit too */ + dn200_prepare_tx_desc(priv, desc, 0, len, csum_insertion, + priv->mode, 1, last_segment, skb->len); + } + + /* Only the last descriptor gets to point to the skb. */ + tx_q->tx_skbuff[entry] = skb; + tx_q->tx_skbuff_dma[entry].buf_type = DN200_TXBUF_T_SKB; + + /* According to the coalesce parameter the IC bit for the latest + * segment is reset and the timer re-started to clean the tx status. + * This approach takes care about the fragments: desc is the first + * element in case of no SG. 
+ */ + tx_packets = dn200_ring_entries_calc(priv->dma_tx_size, first_tx, entry); + tx_q->tx_count_frames += tx_packets; + priv->tx_intr[queue].packet += tx_packets; + + if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en) + set_ic = true; + else if (!priv->tx_coal_frames[queue]) + set_ic = false; + else if (tx_packets > priv->tx_coal_frames[queue]) + set_ic = true; + else if ((tx_q->tx_count_frames % + priv->tx_coal_frames[queue]) < tx_packets) + set_ic = true; + else + set_ic = false; + + if (set_ic || iatu_lack) { + desc = &tx_q->dma_tx[entry]; + + tx_q->tx_count_frames = 0; + dn200_set_tx_ic(priv, desc); + priv->xstats.tx_set_ic_bit++; + } + + /* We've used all descriptors we need for this skb, however, + * advance cur_tx so that it references a fresh descriptor. + * ndo_start_xmit will fill this descriptor the next time it's + * called and dn200_tx_clean may clean up to this descriptor. + */ + entry = DN200_GET_ENTRY(entry, priv->dma_tx_size); + tx_q->cur_tx = entry; + + if (netif_msg_pktdata(priv)) { + netdev_info(priv->dev, + "%s: curr=%d dirty=%d f=%d, e=%d, first=%p, nfrags=%d", + __func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry, + entry, first, nfrags); + + netdev_info(priv->dev, ">>> frame to be transmitted: "); + print_pkt(skb->data, skb->len); + } + if (unlikely(dn200_tx_avail(priv, queue) <= (MAX_SKB_FRAGS + 1))) { + netif_dbg(priv, hw, priv->dev, "%s: stop transmitted packets\n", + __func__); + netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); + } + + dev->stats.tx_bytes += skb->len; + priv->tx_intr[queue].bytes += skb->len; + netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); + if (priv->sarc_type) + dn200_set_desc_sarc(priv, first, priv->sarc_type); + + skb_tx_timestamp(skb); + + /* Ready to fill the first descriptor and set the OWN bit w/o any + * problems because all the descriptors are actually ready to be + * passed to the DMA engine. 
+ */ + if (likely(!is_jumbo)) { + bool last_segment = (nfrags == 0); + + des = dma_map_single(priv->device, skb->data, + nopaged_len, DMA_TO_DEVICE); + if (dma_mapping_error(priv->device, des)) + goto dma_map_err; + + tx_q->tx_skbuff_dma[first_entry].buf = des; + tx_q->tx_skbuff_dma[first_entry].buf_type = DN200_TXBUF_T_SKB; + tx_q->tx_skbuff_dma[first_entry].map_as_page = false; + tx_q->tx_skbuff_dma[first_entry].len = nopaged_len; + tx_q->tx_skbuff_dma[first_entry].last_segment = last_segment; + + dma32_addr = 0; + if (!dma_can_direct_use(priv, des)) { + if (dn200_tx_iatu_find + (des, tx_q, + &tx_q->tx_skbuff_dma[first_entry].iatu_ref_ptr, + &base_addr) < 0) { + iatu_lack = true; + dma32_addr = + dn200_dma32_allocator(priv, skb->data, + nopaged_len, queue, + first_entry, + &base_addr); + if (dma32_addr) { + des = dma32_addr; + } else { + /* use 64bits as DMA mask, hw limit is 40 or 32 bits, + * so the prev dma map output 48bit dma addr can't be used, + * just go out + */ + dev_err(priv->device, + "%s %d dma32_allocator failed\n", + __func__, __LINE__); + goto dma_map_err; + } + } + + /* 1. for normal addr: the iatu limit can be changed + * 2. for dam32 addr: the iatu limit can not be changed, force to use 32bit + * you can change LIMIT_MASK(26 or 28 bit) to verify + * different address range (e.g. 
64MB, 256MB) for one iATU to cover + */ + if (!dma32_addr) + des = (des & LIMIT_MASK) | base_addr; + else + des = (des & MAX_LIMIT_MASK) | base_addr; + } + dn200_set_desc_addr(priv, first, des, priv->hw); + + if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && + priv->hwts_tx_en)) { + /* declare that device is doing timestamping */ + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + dn200_enable_tx_timestamp(priv, first); + } + + /* Prepare the first descriptor setting the OWN bit too */ + dn200_prepare_tx_desc(priv, first, 1, nopaged_len, + csum_insertion, priv->mode, 0, + last_segment, skb->len); + } + + dn200_set_tx_owner(priv, first); + + dn200_enable_dma_transmission(priv, priv->ioaddr); + if (netif_xmit_stopped(netdev_get_tx_queue(priv->dev, queue)) || !netdev_xmit_more()) { + dn200_flush_tx_descriptors(priv, queue); + if (atomic_read(&tx_q->txtimer_running) == 0) + dn200_tx_timer_arm(priv, queue); + } + /* Make sure that data has been completely written. */ + tx_q->next_to_watch = tx_q->cur_tx; + return NETDEV_TX_OK; +dma_map_err: + for (;;) { + tx_q->tx_skbuff[entry] = NULL; + desc = tx_q->dma_tx + entry; + dn200_unmap_txbuff(priv, tx_q, entry); + dn200_free_dma32_tx_buffer(priv, tx_q, entry); + dn200_release_tx_desc(priv, desc, priv->mode); + if (entry == first_entry) + break; + + entry = DN200_GET_PREVENTRY(entry, priv->dma_tx_size); + } + + if (has_vlan) { + entry = DN200_GET_PREVENTRY(entry, priv->dma_tx_size); + desc = tx_q->dma_tx + entry; + dn200_release_tx_desc(priv, desc, priv->mode); + } + tx_q->cur_tx = entry; + netdev_err(priv->dev, "Tx DMA map failed\n"); + dev_kfree_skb(skb); + priv->dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +static void dn200_rx_vlan(struct dn200_priv *priv, struct sk_buff *skb, + struct dma_desc *p) +{ + u16 vlanid; + + if ((priv->dev->features & NETIF_F_HW_VLAN_CTAG_RX) || PRIV_IS_VF(priv)) { + vlanid = dn200_get_ovt(priv, p); + if (!vlanid) + return; + + priv->swc.mmc_rx_vlan_strip++; + 
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid); + } +} + +/* Base on HW curr ptr(pidx) and tail ptr to debug the ring state: + * Ring empty: curr ptr == tail + * Ring full: avail_desc < MIN_RX_FREE_DES + */ + +/** + * dn200_rx_refill - refill used skb preallocated buffers + * @priv: driver private structure + * @queue: RX queue index + * Description : this is to reallocate the skb for the reception process + * that is based on zero-copy. + * + * Returns false if all allocations were successful, true if any fail. Returning + * true signals to the caller that we didn't replace cleaned_count buffers and + * there is more work to do. + */ +static inline bool dn200_rx_refill(struct dn200_priv *priv, u32 queue, + u32 q_depth, int cleaned_count) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + int dirty = dn200_rx_dirty(priv, queue); + unsigned int entry = rx_q->dirty_rx; + struct dn200_rx_buffer *buf; + struct dma_desc *p; + + if (unlikely(dirty == 0 && cleaned_count > 0)) { + dev_err(priv->device, + "%s, %d, rx ring %d is abnormal, dirty is 0, but cleaned %d, cur rx:%d, dirty rx:%d, ring depth:%d\n", + __func__, __LINE__, queue, cleaned_count, rx_q->cur_rx, + rx_q->dirty_rx, priv->dma_rx_size); + } + + /* do nothing if no valid cleaned_count */ + if (!cleaned_count) + return false; + dirty = cleaned_count; + + while (dirty > 0) { + buf = &rx_q->buf_pool[entry]; + p = rx_q->dma_rx + entry; + + if (dn200_alloc_page + (priv, rx_q, buf, FIRST_PAGE, entry, q_depth)) { + netdev_err(priv->dev, + "alloc page failure for rx buf\n"); + break; + } + + /* sync the buffer for use by the device */ + dma_sync_single_range_for_device(priv->device, buf->kernel_addr, + buf->page_offset, + priv->dma_buf_sz, + DMA_FROM_DEVICE); + dn200_set_desc_addr(priv, p, buf->desc_addr + buf->page_offset, + priv->hw); + + dn200_set_desc_sec_addr(priv, p, 0, false, priv->hw); + + dn200_refill_desc3(priv, rx_q, p); + + dma_wmb(); + dn200_set_rx_owner(priv, p, 0); + buf->rx_times = 0; 
+ + entry = DN200_GET_ENTRY(entry, q_depth); + dirty--; + } + /*use old tail addr to debug rx ring state */ + + rx_q->dirty_rx = entry; + + if (unlikely(rx_q->dirty_rx == rx_q->cur_rx)) + dev_err(priv->device, + "%s, %d, must not happen (dirty rx == curr rx), dirty:%d, cur rx:%d\n", + __func__, __LINE__, rx_q->dirty_rx, rx_q->cur_rx); + + rx_q->rx_tail_addr = rx_q->dma_rx_phy + + (rx_q->dirty_rx * sizeof(struct dma_desc)); + dn200_set_rx_tail_ptr(priv, priv->ioaddr, rx_q->rx_tail_addr, queue, + priv->hw); + + return !!dirty; +} + +static unsigned int dn200_rx_buf1_len(struct dn200_priv *priv, + struct dma_desc *p, + int status, unsigned int len) +{ + unsigned int plen = 0; + int coe = priv->hw->rx_csum; + + /* First descriptor, not last descriptor and not split header */ + if (status & rx_not_ls) + return priv->dma_buf_sz; + + plen = dn200_get_rx_frame_len(priv, p, coe); + + plen = plen - len; + + /* First descriptor and last descriptor and not split header */ + return plen; +} + +#define DN200_DESC_UNUSED(priv, rx_q) \ + ((((rx_q)->cur_rx > (rx_q)->dirty_rx) ? 0 : (priv)->dma_rx_size) + \ + (rx_q)->cur_rx - (rx_q)->dirty_rx - 1) + + +static inline void +dn200_get_ntuple_filter_num(struct dn200_priv *priv, struct dma_desc *desc, + u8 *status, u32 *ntuple_drop) +{ + u8 filter_no = 0; + + if (!(priv->dev->features & NETIF_F_NTUPLE)) + return; + + if (desc->des2 & (XGMAC_RDES2_L4FM | XGMAC_RDES2_L3FM)) { + filter_no = + (desc->des2 & XGMAC_RDES2_MADRM) >> XGMAC_RDES2_MADRM_SHIFT; + if ((1 << filter_no) & priv->fdir_map) { + *status |= discard_frame; + priv->swc.mmc_rx_fd_drop++; + (*ntuple_drop)++; + } + } +} + +/** + * dn200_rx - manage the receive process + * @priv: driver private structure + * @limit: napi bugget + * @queue: RX queue index. + * Description : this the function called by the napi poll method. + * It gets all the frames inside the ring. 
+ */ +static int dn200_rx(struct dn200_priv *priv, int limit, u32 queue, u32 q_depth) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + struct dn200_channel *ch = &priv->channel[queue]; + unsigned int count = 0, error = 0, len = 0, ntuple_drop = 0; + int status = 0, coe = priv->hw->rx_csum; + u8 filter_status = 0; + unsigned int next_entry = rx_q->cur_rx; + enum dma_data_direction dma_dir; + struct sk_buff *skb = NULL; + struct page *page = NULL; + unsigned char *hard_start; + int cleaned_count = DN200_DESC_UNUSED(priv, rx_q); + bool is_failure = false; + void *buf_addr = NULL; + + dma_dir = DMA_FROM_DEVICE; + + if (netif_msg_rx_status(priv)) { + void *rx_head; + u16 desc_size; + + netdev_info(priv->dev, "%s: descriptor ring:\n", __func__); + rx_head = (void *)rx_q->dma_rx; + desc_size = sizeof(struct dma_desc); + + dn200_display_ring(priv, rx_head, q_depth, true, + rx_q->dma_rx_phy, desc_size, priv->hw); + } + if (cleaned_count >= dn200_rx_refill_size(priv)) { + is_failure = + dn200_rx_refill(priv, queue, q_depth, cleaned_count); + + if (unlikely(is_failure)) { + dev_err(priv->device, + "%s %d rx refill failure %d\n", + __func__, __LINE__, is_failure); + goto err_refill_fail; + } + cleaned_count = 0; + } + + while (count < limit) { + unsigned int buf1_len = 0, buf2_len = 0; + enum pkt_hash_types hash_type; + struct dn200_rx_buffer *buf; + struct dma_desc *np, *p; + int entry; + u32 hash; + + if (unlikely(!count && rx_q->state_saved)) { + skb = rx_q->state.skb; + error = rx_q->state.error; + len = rx_q->state.len; + } else { + rx_q->state_saved = false; + skb = NULL; + error = 0; + len = 0; + } + + if (count >= limit) + break; + + /* return some buffers to hardware + * fix issue: in the condition of 64 depth of desc ring, + * napi budget is more than available desc number(63) + * will have opportunity recv packet twice from + * one rx desc(almost always happen when rx burst) + */ + if (cleaned_count >= dn200_rx_refill_size(priv)) { + is_failure = + 
dn200_rx_refill(priv, queue, q_depth, cleaned_count); + + if (unlikely(is_failure)) { + dev_err(priv->device, + "%s %d rx refill failure %d\n", + __func__, __LINE__, is_failure); + goto err_refill_fail; + } + cleaned_count = 0; + } + +read_again: + buf1_len = 0; + buf2_len = 0; + entry = next_entry; + buf = &rx_q->buf_pool[entry]; + p = rx_q->dma_rx + entry; + /* read the status of the incoming frame */ + status = dn200_rx_status(priv, &priv->dev->stats, + &priv->xstats, p, priv->rec_all); + /* check if managed by the DMA otherwise go ahead */ + if (unlikely(status & dma_own)) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we know the + * dma_own bit is set. + */ + dma_rmb(); + rx_q->cur_rx = DN200_GET_ENTRY(rx_q->cur_rx, q_depth); + next_entry = rx_q->cur_rx; + np = rx_q->dma_rx + next_entry; + prefetch(np); + + dn200_get_ntuple_filter_num(priv, p, &filter_status, + &ntuple_drop); + buf1_len = dn200_rx_buf1_len(priv, p, status, len); + if (buf1_len > priv->dma_buf_sz) { + dev_err(priv->device, + "%s, %d, invalid buf len %d.\n", + __func__, __LINE__, buf1_len); + status |= buf_len_err; + } + if (unlikely + (status & (discard_frame | buf_len_err) || + filter_status == discard_frame)) { + error = 1; + if (!priv->hwts_rx_en) + priv->dev->stats.rx_errors++; + } + + if (unlikely(error && (status & rx_not_ls))) { + cleaned_count++; + dn200_rx_pool_buf_free(rx_q->rx_pool, rx_q->queue_index, + buf->pg_buf); + goto read_again; + } + if (unlikely(error)) { + if (skb) { + dev_kfree_skb(skb); + skb = NULL; + } + count++; + cleaned_count++; + dn200_rx_pool_buf_free(rx_q->rx_pool, rx_q->queue_index, + buf->pg_buf); + if (status & buf_len_err) { + netdev_err(priv->dev, "buf len err %d\n", buf1_len); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + break; + } + continue; + } + + /* Buffer is 
good. Go on. */ + /* page_offset can be the first or sencond part of the page */ + buf_addr = page_address(buf->page) + buf->page_offset; + hard_start = buf_addr - dn200_rx_offset(priv); + net_prefetch(buf_addr); + len += buf1_len; + if (!skb) { + if (unlikely(buf->pg_buf->low_res)) { + if (priv->txrx_itr_combined) + skb = + napi_alloc_skb(&ch->agg_napi, + buf1_len); + else + skb = + napi_alloc_skb(&ch->rx_napi, + buf1_len); + if (unlikely(!skb)) { + dev_err(priv->device, + "%s, %d, build skb fail.\n", + __func__, __LINE__); + goto drain_data; + } + skb_copy_to_linear_data(skb, + (unsigned char + *)(hard_start + + dn200_rx_offset + (priv)), buf1_len); + priv->swc.rx_mem_copy++; + } else { + skb = build_skb(hard_start, DN200_RX_BUF_SIZE); + if (unlikely(!skb)) { + dev_err(priv->device, + "%s, %d, build skb fail.\n", + __func__, __LINE__); + goto drain_data; + } + skb_reserve(skb, dn200_rx_offset(priv)); + } + skb_put(skb, buf1_len); + dn200_rx_pool_buf_free(rx_q->rx_pool, rx_q->queue_index, + buf->pg_buf); + cleaned_count++; + } else if (buf1_len) { + if (unlikely(buf->pg_buf->low_res)) { + page = + dev_alloc_pages(dn200_rx_pg_order_get + (priv)); + if (unlikely(!page)) { + dev_err(priv->device, + "%s, %d, build frag fail.\n", + __func__, __LINE__); + dev_kfree_skb(skb); + goto drain_data; + } + priv->swc.rx_mem_copy++; + memcpy(page_to_virt(page), buf_addr, buf1_len); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + page, 0, buf1_len, PAGE_SIZE); + } else { + dma_sync_single_range_for_cpu(priv->device, + buf->kernel_addr, + buf->page_offset, + buf1_len, + dma_dir); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + buf->page, buf->page_offset, + buf1_len, priv->dma_buf_sz); + } + dn200_rx_pool_buf_free(rx_q->rx_pool, rx_q->queue_index, + buf->pg_buf); + cleaned_count++; + } + +drain_data: + if (unlikely(status & rx_not_ls)) + goto read_again; + if (!skb) + continue; + /* Got entire packet into SKB. Finish it. 
*/ + dn200_get_rx_hwtstamp(priv, p, np, skb); + dn200_rx_vlan(priv, skb, p); + skb->protocol = eth_type_trans(skb, priv->dev); + if (unlikely(!coe)) + skb_checksum_none_assert(skb); + else + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!dn200_get_rx_hash(priv, p, &hash, &hash_type)) + skb_set_hash(skb, hash, hash_type); + skb_record_rx_queue(skb, queue); + if (priv->txrx_itr_combined) + napi_gro_receive(&ch->agg_napi, skb); + else + napi_gro_receive(&ch->rx_napi, skb); + skb = NULL; + priv->dev->stats.rx_packets++; + priv->dev->stats.rx_bytes += len; + count++; + priv->rx_intr[queue].bytes += len; + } + + if (status & rx_not_ls || skb) { + rx_q->state_saved = true; + rx_q->state.skb = skb; + rx_q->state.error = error; + rx_q->state.len = len; + } +err_refill_fail: + priv->rx_intr[queue].packet += count; + priv->xstats.rx_pkt_n += (count - ntuple_drop); + priv->xstats.rxq_stats[queue].rx_pkt_n += (count - ntuple_drop); + /* if refill status is failure, + * return budget limit(e.g. 64) to put current napi to schedule list + */ + return is_failure ? limit : (int)count; +} + +/** + * dn200_update_ring_itr - update the dynamic ITR value based on packet size + * @q_vector: pointer to q_vector + * + * Stores a new ITR value based on strictly on packet size. This + * algorithm is less sophisticated than that used in dn200_update_itr, + * due to the difficulty of synchronizing statistics across multiple + * receive rings. The divisors and thresholds used by this function + * were determined based on theoretical maximum wire speed and testing + * data, in order to minimize response time while increasing bulk + * throughput. + * This functionality is controlled by ethtool's coalescing settings. + * NOTE: This function is called only when operating in a multiqueue + * receive environment. 
+ **/ +static void dn200_update_1G_speed_itr(struct dn200_itr_info *itr, + struct dn200_priv *priv, u8 chan) +{ + u64 avg_wire_size = 0, packets = 0, bytes = 0, itr_usec = 0; + unsigned long next_update = jiffies; + u32 max_usec = priv->rx_itr_usec; + u32 min_usec = priv->rx_itr_usec_min; + + /* These will do nothing if dynamic updates are not enabled */ + if (!(itr->itr_setting & DN200_ITR_DYNAMIC_ITR)) + return; + + if (time_after(next_update, itr->next_update)) + goto clear_counts; + + packets = itr->packet; + bytes = itr->bytes; + if (!packets || !bytes) { + itr_usec = itr->target_itr; + goto clear_counts; + } + + avg_wire_size = bytes / packets; + + /* if avg_wire_size isn't set no work was done */ + if (!avg_wire_size) + goto clear_counts; + + /* Add 24 bytes to size to account for CRC, preamble, and gap */ + avg_wire_size += 24; + + /* Give a little boost to mid-size frames */ + if (avg_wire_size >= 512 && avg_wire_size < 1200) { + itr_usec = avg_wire_size / 30; + } else if (avg_wire_size > 128 && avg_wire_size < 512) { + itr_usec = avg_wire_size / 10; + } else { + itr_usec = avg_wire_size / 5; + /* workaround: fix bonding iperf tx slow issue caused by + * rx small packet with high interrupt frequency(low rx-usecs) + * reason: + * 1. tx tso pkts lead to many rx acks for one tso pkt, + * so at best receive multi ack with one interrupt, + * but just deal with one ack with one hw interrupt in high interrupt frequency + * 2. 
tx & rx are two irqs, if aggregate together, + * will prcocess more ack in once softirq + */ + if (test_bit(DN200_IS_BONDING, &priv->state)) + itr_usec = max_usec; + } + + if (itr_usec > max_usec) + itr_usec = max_usec; + +clear_counts: + /* write back value */ + itr->target_itr = (itr_usec & DN200_ITR_MASK); + if (unlikely(itr->target_itr < min_usec)) { + netdev_dbg(priv->dev, "invalid target itr usec:%u, rx itr usec:%u\n", + itr->target_itr, priv->rx_itr_usec); + itr->target_itr = min_usec; + } + /* next update should occur within next jiffy */ + itr->next_update = next_update + msecs_to_jiffies(1); + + itr->bytes = 0; + itr->packet = 0; +} + +static void dn200_rx_itr_update(struct dn200_itr_info *itr, + struct dn200_priv *priv, u8 chan) +{ + u64 avg_wire_size = 0, packets = 0, bytes = 0, itr_usec; + unsigned long next_update = jiffies; + u32 max_usec = priv->rx_itr_usec; + + if (priv->plat_ex->phy_info->speed == SPEED_1000) + max_usec = 0x20; + + itr_usec = priv->min_usecs | DN200_ITR_ADAPTIVE_LATENCY; + + /* These will do nothing if dynamic updates are not enabled */ + if (!(itr->itr_setting & DN200_ITR_DYNAMIC_ITR)) + return; + + packets = itr->packet; + bytes = itr->bytes; + if (!packets || !bytes) + return; + + if (time_after(next_update, itr->next_update)) + goto clear_counts; + + if (itr->itr_countdown) { + itr_usec = itr->target_itr; + goto clear_counts; + } + + if (packets && packets == 1) { + itr_usec = itr->target_itr; + itr_usec &= DN200_ITR_MASK; + /* If packet count is 1 likely looking + * at a slight overrun of the delay we want. Try halving + * our delay to see if that will cut the number of packets + * in half per interrupt. 
+ */ + itr_usec >>= 2; + itr_usec &= DN200_ITR_MASK; + if (itr_usec < priv->min_usecs) + itr_usec = priv->min_usecs; + goto clear_counts; + } else if (packets >= 2 && bytes < 9000) { + itr_usec = DN200_ITR_ADAPTIVE_LATENCY; + goto adjust_by_size; + } else if (packets >= 2 && packets < 32) { + itr_usec = (itr->target_itr << 1); + if ((itr_usec & DN200_ITR_MASK) > max_usec) + itr_usec = max_usec; + } else if (packets >= 32 && packets < 56) { + itr_usec = (itr->target_itr + DN200_ITR_MIN_INC); + if ((itr_usec & DN200_ITR_MASK) > max_usec) + itr_usec = max_usec; + + goto clear_counts; + } else if (packets <= 256) { + itr_usec = itr->target_itr; + itr_usec &= DN200_ITR_MASK; + + /* Between 56 and 112 is our "goldilocks" zone where we are + * working out "just right". Just report that our current + * ITR is good for us. + */ + if (packets <= 112) + goto clear_counts; + + /* If packet count is 128 or greater we are likely looking + * at a slight overrun of the delay we want. Try halving + * our delay to see if that will cut the number of packets + * in half per interrupt. + */ + itr_usec -= priv->min_usecs; + if (itr_usec < priv->min_usecs) + itr_usec = priv->min_usecs; + goto clear_counts; + } + +adjust_by_size: + /* If packet counts are 256 or greater we can assume we have a gross + * overestimation of what the rate should be. Instead of trying to fine + * tune it just use the formula below to try and dial in an exact value + * give the current packet size of the frame. 
+ */ + avg_wire_size = bytes / packets; + if (avg_wire_size <= 60) { + /* Start at 250k ints/sec */ + avg_wire_size = 4096; + } else if (avg_wire_size <= 380) { + /* 250K ints/sec to 60K ints/sec */ + avg_wire_size *= 40; + avg_wire_size += 1696; + } else if (avg_wire_size <= 1084) { + /* 60K ints/sec to 36K ints/sec */ + avg_wire_size *= 15; + avg_wire_size += 11452; + } else if (avg_wire_size <= 1980) { + /* 36K ints/sec to 30K ints/sec */ + avg_wire_size *= 5; + avg_wire_size += 22420; + } else { + /* plateau at a limit of 30K ints/sec */ + avg_wire_size = 32256; + } + + /* If we are in low latency mode halve our delay which doubles the + * rate to somewhere between 100K to 16K ints/sec + */ + if (itr_usec & DN200_ITR_ADAPTIVE_LATENCY) + avg_wire_size >>= 1; + + /* Resultant value is 256 times larger than it needs to be. This + * gives us room to adjust the value as needed to either increase + * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc. + * + * Use addition as we have already recorded the new latency flag + * for the ITR value. 
+ */ + itr_usec += DIV_ROUND_UP(avg_wire_size, itr->itr_div) * 2; + if ((itr_usec & DN200_ITR_MASK) > max_usec) { + itr_usec &= DN200_ITR_ADAPTIVE_LATENCY; + itr_usec += max_usec; + } + +clear_counts: + /* write back value */ + itr->target_itr = (itr_usec & DN200_ITR_MASK); + if (unlikely(itr->target_itr < priv->min_usecs)) { + netdev_err(priv->dev, "invalid target itr usec:%u, rx itr usec:%u\n", + itr->target_itr, priv->rx_itr_usec); + itr->target_itr = priv->min_usecs; + } + + /* next update should occur within next jiffy */ + itr->next_update = next_update + msecs_to_jiffies(1); + itr->bytes = 0; + itr->packet = 0; +} + +static void dn200_tx_itr_update(struct dn200_priv *priv, + struct dn200_itr_info *itr, u8 chan) +{ + u64 avg_wire_size = 0, packets = 0, bytes = 0, tx_frames = 0; + u32 max_tx_frame = 0; + unsigned long next_update = jiffies; + + /* These will do nothing if dynamic updates are not enabled */ + if (!(itr->itr_setting & DN200_ITR_DYNAMIC_ITR)) + return; + + if (time_after(next_update, itr->next_update)) { + tx_frames = 1; + goto clear_counts; + } + + if (itr->itr_countdown) { + tx_frames = itr->target_itr; + goto clear_counts; + } + + packets = itr->packet; + bytes = itr->bytes; + + if (!packets || !bytes) { + tx_frames = itr->target_itr; + goto clear_counts; + } + + avg_wire_size = bytes / packets; + if (!avg_wire_size) { + tx_frames = itr->target_itr; + goto clear_counts; + } + if (avg_wire_size > 4096) + max_tx_frame = 8; + else if (avg_wire_size > 2048) + max_tx_frame = 16; + else if (avg_wire_size > 1024) + max_tx_frame = 32; + else if (avg_wire_size < 256) + max_tx_frame = 128; + else + max_tx_frame = 64; + + if (priv->plat_ex->phy_info->speed == SPEED_1000) + max_tx_frame = max_t(u32, max_tx_frame >> 2, 1); + + max_tx_frame = min((u32)(priv->dma_tx_size >> 1), (u32)max_tx_frame); + tx_frames = max_tx_frame; + +clear_counts: + /* write back value */ + if (tx_frames) + itr->target_itr = (tx_frames & DN200_ITR_MASK); + /* next update should 
occur within next jiffy */ + itr->next_update = next_update + msecs_to_jiffies(1); + + itr->bytes = 0; + itr->packet = 0; + priv->tx_mem_copy = 0; +} + +static int dn200_napi_poll_rx(struct napi_struct *napi, int budget) +{ + struct dn200_channel *ch = + container_of(napi, struct dn200_channel, rx_napi); + struct dn200_priv *priv = ch->priv_data; + u32 chan = ch->index; + int work_done; + struct dn200_itr_info *rx_intr; + + rx_intr = &priv->rx_intr[chan]; + + if (unlikely(test_bit(DN200_DOWN, &priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) { + napi_complete(napi); + return 0; + } + priv->xstats.napi_poll++; + work_done = dn200_rx(priv, budget, chan, priv->dma_rx_size); + if (work_done < budget && napi_complete_done(napi, work_done)) { + priv->dn200_update_ops->dn200_rx_itr_update(rx_intr, priv, chan); + if (rx_intr->current_itr != rx_intr->target_itr) { + /* Rx ITR needs to be increased, second priority */ + rx_intr->current_itr = rx_intr->target_itr; + rx_intr->itr_countdown = ITR_COUNTDOWN_COUNT; + } else { + rx_intr->current_itr = rx_intr->target_itr; + /* No ITR update, lowest priority */ + if (rx_intr->itr_countdown) + rx_intr->itr_countdown--; + } + dn200_rx_watchdog(priv, priv->ioaddr, + dn200_usec2riwt(rx_intr->target_itr, priv), chan, + priv->hw); + } + /* In the condition of tx and rx share same irq(when use single msi), + * will lead to interrupt lost when + * process hardware interrupt by same cpu in dn200_interrupt, + * the phenomenon is hw rx queue curr == tail index + * (means hw consume all descs and sw don't refill) + * however hw interrupt(tx & rx) status is kept in register, + * so we call dn200_dma_interrupt again to process all queues' tx & rx interrupt + */ + if (!priv->plat->multi_msi_en) + dn200_dma_interrupt(priv); + return work_done; +} + +static void tx_frame_count(struct dn200_priv *priv, u32 target_itr) +{ + if (target_itr < 17) + priv->xstats.tx_frames_16_below++; + else if (target_itr < 33) + 
priv->xstats.tx_frames_17_to_32++; + else if (target_itr < 65) + priv->xstats.tx_frames_33_to_64++; + else if (target_itr < 129) + priv->xstats.tx_frames_65_to_128++; + else + priv->xstats.tx_frames_129_to_256++; +} + +static int dn200_napi_poll_tx(struct napi_struct *napi, int budget) +{ + struct dn200_channel *ch = + container_of(napi, struct dn200_channel, tx_napi); + struct dn200_priv *priv = ch->priv_data; + struct dn200_itr_info *tx_intr; + u32 chan = ch->index; + int work_done; + + priv->xstats.napi_poll++; + tx_intr = &priv->tx_intr[chan]; + + if (unlikely(test_bit(DN200_DOWN, &priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) { + napi_complete(napi); + return 0; + } + + work_done = dn200_tx_clean(priv, budget, chan); + work_done = min(work_done, budget); + + if (work_done < budget && napi_complete_done(napi, work_done)) { + if (tx_intr->packet) { + dn200_tx_itr_update(priv, tx_intr, chan); + tx_frame_count(priv, tx_intr->target_itr); + if (tx_intr->target_itr < tx_intr->current_itr) { + /* tx ITR needs to be reduced, this is highest priority */ + tx_intr->current_itr = tx_intr->target_itr; + tx_intr->itr_countdown = ITR_COUNTDOWN_COUNT; + priv->tx_coal_frames[chan] = + tx_intr->target_itr; + } else if (tx_intr->current_itr != tx_intr->target_itr) { + /* tx ITR needs to be increased, second priority */ + tx_intr->current_itr = tx_intr->target_itr; + tx_intr->itr_countdown = ITR_COUNTDOWN_COUNT; + priv->tx_coal_frames[chan] = + tx_intr->target_itr; + } else { + tx_intr->current_itr = tx_intr->target_itr; + /* No ITR update, lowest priority */ + if (tx_intr->itr_countdown) + tx_intr->itr_countdown--; + } + if (!(tx_intr->itr_setting & DN200_ITR_DYNAMIC_ITR)) + priv->tx_coal_frames[chan] = + priv->tx_coal_frames_set[chan] ? 
: 1; + } + dn200_enable_tx_dma_irq(priv->ioaddr, ch->index, priv->hw); + } + + /* In the condition of tx and rx share same irq(when use single msi), + * will lead to interrupt lost when process hardware interrupt by + * same cpu in dn200_interrupt, + * the phenomenon is hw rx queue curr == tail index + * (means hw consume all descs and sw don't refill) + * however hw interrupt(tx & rx) status is kept in register, + * so we call dn200_dma_interrupt again to + * process all queues' tx & rx interrupt + */ + if (!priv->plat->multi_msi_en) + dn200_dma_interrupt(priv); + + return work_done; +} + +static int dn200_napi_poll_agg(struct napi_struct *napi, int budget) +{ + struct dn200_channel *ch = + container_of(napi, struct dn200_channel, agg_napi); + struct dn200_priv *priv = ch->priv_data; + u32 chan = ch->index; + struct dn200_rx_queue *rx_q = &priv->rx_queue[chan]; + int work_done; + int rx_rcv; + bool complete_cleaned = true; + struct dn200_itr_info *rx_intr; + struct dn200_itr_info *tx_intr; + + rx_intr = &priv->rx_intr[chan]; + tx_intr = &priv->tx_intr[chan]; + priv->xstats.napi_poll++; + + if (unlikely(test_bit(DN200_DOWN, &priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) { + napi_complete(napi); + return 0; + } + + rx_rcv = dn200_rx(priv, budget, chan, priv->dma_rx_size); + if (rx_rcv >= budget) + complete_cleaned = false; + + work_done = dn200_tx_clean(priv, budget, chan); + if (work_done >= budget) + complete_cleaned = false; + + if (!complete_cleaned) + return budget; + + if (tx_intr->packet) { + dn200_tx_itr_update(priv, tx_intr, chan); + tx_frame_count(priv, tx_intr->target_itr); + if (tx_intr->current_itr != tx_intr->target_itr) { + tx_intr->current_itr = tx_intr->target_itr; + tx_intr->itr_countdown = ITR_COUNTDOWN_COUNT; + } else { + tx_intr->current_itr = tx_intr->target_itr; + /* No ITR update, lowest priority */ + if (tx_intr->itr_countdown) + tx_intr->itr_countdown--; + } + } + + if (rx_intr->packet) { + 
priv->dn200_update_ops->dn200_rx_itr_update(rx_intr, priv, chan); + if (rx_intr->current_itr != rx_intr->target_itr) { + /* Rx ITR needs to be increased, second priority */ + rx_intr->current_itr = rx_intr->target_itr; + rx_intr->itr_countdown = ITR_COUNTDOWN_COUNT; + } else { + rx_intr->current_itr = rx_intr->target_itr; + /* No ITR update, lowest priority */ + if (rx_intr->itr_countdown) + rx_intr->itr_countdown--; + } + } + + if (!((rx_q->dma_rx + rx_q->cur_rx)->des3 & XGMAC_RDES3_OWN)) + return budget; + + if (napi_complete_done(napi, rx_rcv)) { + /* enable rx interrupt through set riwt */ + dn200_rx_watchdog(priv, priv->ioaddr, + dn200_usec2riwt(rx_intr->target_itr, priv), chan, + priv->hw); + priv->tx_coal_frames[chan] = tx_intr->current_itr; + dn200_enable_tx_dma_irq(priv->ioaddr, ch->index, priv->hw); + } + + return rx_rcv; +} + +/** + * dn200_tx_timeout + * @dev : Pointer to net device structure + * @txqueue: the index of the hanging transmit queue + * Description: this function is called when a packet transmission fails to + * complete within a reasonable time. The driver will mark the error in the + * netdev structure and arrange for the device to be reset to a sane state + * in order to transmit a new packet. 
 */
static void dn200_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
	struct dn200_priv *priv = netdev_priv(dev);
	u32 tx_count = priv->plat->tx_queues_to_use;
	u32 queue, hw_chan;
	u32 cache_lvl;
	u32 ch_tail, ch_curr, tx_avail;
	u32 dbg_status, dbg_status0, dbg_status1;

	/* If the BAR no longer answers, the PCIe link is gone: latch the
	 * error state and trigger global error recovery instead of dumping
	 * (unreadable) registers.
	 */
	if (!dn200_hwif_id_check(priv->ioaddr)) {
		netdev_err(dev, "%s :%s\n", __func__, DN200_PCIE_BAR_ERR);
		set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state);
		set_bit(DN200_PCIE_UNAVAILD, &priv->state);
		dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR);
		return;
	}

	netdev_info(dev, "=== Tx data path debug info ===\n");

	for (queue = 0; queue < tx_count; queue++) {
		struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];
		u32 tobe_consume = 0;

		/* sw queue index -> hw DMA channel index */
		hw_chan = queue + DN200_RXQ_START_GET(priv->hw);

		/* 1. get sw & hw ring status, include sw available descs
		 * number, hw to be consume descs number
		 */
		ch_tail = readl(priv->ioaddr +
				XGMAC_DMA_CH_TxDESC_TAIL_LPTR(hw_chan));
		ch_curr = readl(priv->ioaddr +
				XGMAC_DMA_CH_TxDESC_CURR_LPTR(hw_chan));
		tx_avail = dn200_tx_avail(priv, queue);
		netdev_info(dev, "TX queue %d time_running %d task_status %d time_status %d\n",
			    queue, atomic_read(&tx_q->txtimer_running),
			    tx_q->task_need_sch, tx_q->txtimer_need_sch);
		netdev_info(dev, "TX Queue %d channel %d %s\n", queue, hw_chan,
			    ((tx_avail < priv->dma_tx_size - 1) ||
			     (ch_tail != ch_curr)) ?
				    "- running" :
				    "");

		/* Descriptor pointers are byte addresses; one descriptor is
		 * 16 bytes, hence the "/ 16" below.
		 */
		if (ch_tail != ch_curr)
			tobe_consume = (ch_curr > ch_tail) ?
					       (priv->dma_tx_size -
						((ch_curr - ch_tail) / 16)) :
					       ((ch_tail - ch_curr) / 16 + 1);

		netdev_info(dev,
			    "Desc sw ring curr_tx:%d, dirty_tx:%d, tx available:%d, result:%s\n",
			    tx_q->cur_tx, tx_q->dirty_tx, tx_avail,
			    tx_avail >
			    DN200_TX_THRESH(priv) ?
			    "yes (good)" :
			    "no (abnormal)");

		netdev_info(dev,
			    "Desc hw ring start:%#x, tail:%#x, curr:%#x, to consume:%u, result:%s\n",
			    readl(priv->ioaddr +
				  XGMAC_DMA_CH_TxDESC_LADDR(hw_chan)), ch_tail,
			    ch_curr, tobe_consume,
			    tobe_consume >
			    (priv->dma_tx_size /
			     2) ? "no (abnormal)" : "yes (good)");

		/*2. get descs cache level */
		cache_lvl =
			readl(priv->ioaddr + XGMAC_CH_DESC_CACHE_LVL(hw_chan));
		netdev_info(dev,
			    "[reg 3168]Desc tx cache levle is:%lu, cache_lvl:%#x\n",
			    (cache_lvl & XGMAC_TXLVL), cache_lvl);

		/*3. get dma channel debug status */
		dbg_status = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(hw_chan));
		netdev_info(dev,
			    "[reg 3164]DMA channel TDWS-des write state:%#lx, TDTS-data transfer state:%#lx, TDFS-des fetch state:%#lx, TDRS:%#lx, TDXS:%#lx, dbg_status:%#x\n",
			    (dbg_status & XGMAC_TDWS) >> XGMAC_TDWS_SHIFT,
			    (dbg_status & XGMAC_TDTS) >> XGMAC_TDTS_SHIFT,
			    (dbg_status & XGMAC_TDFS) >> XGMAC_TDFS_SHIFT,
			    (dbg_status & XGMAC_TDRS) >> XGMAC_TDRS_SHIFT,
			    (dbg_status & XGMAC_TDXS) >> XGMAC_TDXS_SHIFT,
			    dbg_status);

		/*4. get tx dma FSM debug status1(dma_debug_status1) */
		dbg_status1 = readl(priv->ioaddr + XGMAC_DEBUG_ST1);
		netdev_info(dev,
			    "[reg 3024]Chanel %d DMA FSMs are%s actively processing the descriptors or packet data, dma_debug_status1:%#x.\n",
			    hw_chan,
			    (dbg_status1 & (1 << hw_chan)) ? "" : " Not",
			    dbg_status1);

		/*5. get tx dma debug status0(dma_debug_status0) */
		dbg_status0 = readl(priv->ioaddr + XGMAC_DEBUG_ST0);
		netdev_info(dev,
			    "[reg 3020]AXI Master Read Channel is%s active(tx global status), dma_debug_status0:%#x.\n",
			    (dbg_status0 & XGMAC_AXRHSTS) ?
			    "" : " Not",
			    dbg_status0);
	}

	/* Hand off to global error recovery (device reset path). */
	dn200_global_err(priv, DN200_TX_TIMEOUT);
}

/* Re-apply (or disable) every enabled flow-director n-tuple rule.
 * NOTE(review): iteration starts at flow_entries_max - 5 — presumably the
 * last 5 entries are reserved; confirm against the fdir table layout.
 */
static void dn200_fdirs_reconfig(struct dn200_priv *priv, bool enable)
{
	struct dn200_fdir_filter *input;
	int i = priv->flow_entries_max - 5;

	/* VFs do not own the filter table */
	if (HW_IS_VF(priv->hw))
		return;
	/* report total rule count */
	for (; i >= 0; i--) {
		input = &priv->fdir_enties[i];
		if (input->enable) {
			dn200_config_ntuple_filter(priv, priv->hw, i, input,
						   enable);
		}
	}
}

/* Program the rx packet filter and, when SR-IOV / VF synchronisation is
 * required, kick the rxp worker (PF) or notify the peer (VF).
 */
static void _dn200_set_filter(struct dn200_priv *priv, struct net_device *dev)
{
	u8 wakeup_wq = false;

	dn200_set_filter(priv, priv->hw, dev, &wakeup_wq);
	priv->pf_rxp_set |= RXP_SET_FIL;
	/* Only queue the rxp task when no rxp programming is in flight;
	 * otherwise just mark that a re-check is needed.
	 */
	if (wakeup_wq && PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state))
		queue_work(priv->wq, &priv->rxp_task);
	else if (PRIV_SRIOV_SUPPORT(priv))
		set_bit(DN200_RXP_NEED_CHECK, &priv->state);

	/* A VF cannot touch the shared filter hw directly: signal the PF */
	if (wakeup_wq && PRIV_IS_VF(priv)) {
		netdev_dbg(priv->dev, "%s %d notify pf set filter\n", __func__, __LINE__);
		DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, RXP_TASK, 1);
		irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl);
	}
}
/**
 * dn200_set_rx_mode - entry point for multicast addressing
 * @dev : pointer to the device structure
 * Description:
 * This function is a driver entry point which gets called by the kernel
 * whenever multicast addresses must be enabled/disabled.
 * Return value:
 * void.
 */
static void dn200_set_rx_mode(struct net_device *dev)
{
	u32 chan;

	struct dn200_priv *priv = netdev_priv(dev);
	netdev_features_t features = dev->features;

	/* Registers are unreachable if the BAR check fails; nothing to do */
	if (!dn200_hwif_id_check(priv->ioaddr)) {
		netdev_err(dev, "%s :%s\n", __func__, DN200_PCIE_BAR_ERR);
		return;
	}

	/* Skip filter programming while the device is still initialising */
	if (!test_bit(DN200_DEV_INIT, &priv->state)) {
		_dn200_set_filter(priv, dev);
		/* VLAN rx filtering only when the feature is on and we are
		 * not in promiscuous mode
		 */
		if (!!(features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
		    !(dev->flags & IFF_PROMISC))
			_dn200_config_vlan_rx_fltr(priv, priv->hw, true);
		else
			_dn200_config_vlan_rx_fltr(priv, priv->hw, false);
	}
	/* Mirror the CTAG_RX feature bit into hw VLAN stripping */
	if (!!(features & NETIF_F_HW_VLAN_CTAG_RX))
		dn200_rx_vlan_stripping_config(priv, priv->hw, true);
	else
		dn200_rx_vlan_stripping_config(priv, priv->hw, false);

	if (HW_IS_PUREPF(priv->hw)) {
		/* NETIF_F_RXALL: also deliver error frames to the stack */
		if (dev->features & NETIF_F_RXALL)
			priv->rec_all = true;
		else
			priv->rec_all = false;
		for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) {
			dn200_dma_rx_all_set(priv, priv->ioaddr, chan,
					     priv->rec_all, priv->hw);
		}

		/* NOTE(review): the last queue is configured separately here —
		 * presumably a queue outside rx_queues_to_use; confirm.
		 */
		if (!PRIV_IS_VF(priv)) {
			dn200_dma_rx_all_set(priv, priv->ioaddr,
					     DN200_LAST_QUEUE(priv),
					     priv->rec_all, priv->hw);
		}
	}
	/* Re-apply L3/L4 + flow-director rules when n-tuple filtering is on */
	if (dev->features & NETIF_F_NTUPLE) {
		dn200_l3_l4_filter_config(priv, priv->hw,
					  !!(dev->features & NETIF_F_NTUPLE));
		dn200_fdirs_reconfig(priv, !!(dev->features & NETIF_F_NTUPLE));
	}
}

/**
 * dn200_change_mtu - entry point to change MTU size for the device.
 * @dev : device pointer.
 * @new_mtu : the new MTU size for the device.
 * Description: the Maximum Transfer Unit (MTU) is used by the network layer
 * to drive packet transmission. Ethernet has an MTU of 1500 octets
 * (ETH_DATA_LEN). This value can be changed with ifconfig.
 * Return value:
 * 0 on success and an appropriate (-)ve integer as defined in errno.h
 * file on failure.
 */
static int dn200_change_mtu(struct net_device *dev, int new_mtu)
{
	struct dn200_priv *priv = netdev_priv(dev);
	const int mtu = new_mtu;
	int min_mtu = 0, ret = 0;

	if (new_mtu == (int)dev->mtu) {
		netdev_warn(dev, "MTU is already %u\n", dev->mtu);
		return 0;
	}

	/* XDP buffers cannot hold jumbo frames */
	if (dn200_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) {
		netdev_dbg(priv->dev, "Jumbo frames not supported for XDP\n");
		return -EINVAL;
	}
	/* Query the hw-dependent MTU window (fills maxmtu and min_mtu) */
	ret = dn200_max_mtu_get(priv, &priv->plat->maxmtu, &min_mtu);
	if (ret < 0) {
		dev_err(priv->device, "max_mtu alloc err!\n");
		return -EINVAL;
	}

	/* If condition true, FIFO is too small or MTU too large */
	if (new_mtu > priv->plat->maxmtu)
		return -EINVAL;

	if (new_mtu < min_mtu)
		return -EINVAL;

	netdev_info(priv->dev, "changing MTU from %d to %d\n", dev->mtu, mtu);
	dev->mtu = mtu;
	if (!priv->mii)
		netdev_update_features(dev);
	/* base on new mtu to update rx interrupt usec */
	dn200_rx_itr_usec_update(priv);
	/* NOTE(review): 10-15ms settle delay after MTU change — presumably
	 * to let hw re-configuration complete; confirm requirement.
	 */
	usleep_range(10000, 15000);
	return 0;
}

/* Drop feature bits the hardware/platform cannot honour and mirror the
 * TSO request into priv->tso. Called by the core via ndo_fix_features.
 */
static netdev_features_t dn200_fix_features(struct net_device *dev,
					    netdev_features_t features)
{
	struct dn200_priv *priv = netdev_priv(dev);

	if (priv->plat->rx_coe == DN200_RX_COE_NONE)
		features &= ~NETIF_F_RXCSUM;

	if (!priv->plat->tx_coe)
		features &= ~NETIF_F_CSUM_MASK;

	/* Some GMAC devices have a bugged Jumbo frame support that
	 * needs to have the Tx COE disabled for oversized frames
	 * (due to limited buffer sizes). In this case we disable
	 * the TX csum insertion in the TDES and not use SF.
	 */
	if (priv->plat->bugged_jumbo && dev->mtu > ETH_DATA_LEN)
		features &= ~NETIF_F_CSUM_MASK;

	/* Disable tso if asked by ethtool */
	if (priv->plat->tso_en && priv->dma_cap.tsoen) {
		if (features & NETIF_F_TSO)
			priv->tso = true;
		else
			priv->tso = false;
	}

	return features;
}

/* ndo_set_features: push the changed feature bits down into hardware
 * (rx checksum, split header, VLAN offloads, loopback, RSS, n-tuple).
 */
static int dn200_set_features(struct net_device *netdev,
			      netdev_features_t features)
{
	struct dn200_priv *priv = netdev_priv(netdev);
	netdev_features_t changed = netdev->features ^ features;

	/* Keep the COE Type in case of csum is supporting */
	if (features & NETIF_F_RXCSUM)
		priv->hw->rx_csum = priv->plat->rx_coe;
	else
		priv->hw->rx_csum = 0;
	/* No check needed because rx_coe has been set before and it will be
	 * fixed in case of issue.
	 */
	dn200_rx_ipc(priv, priv->hw);

	/* Split-header depends on rx checksum being enabled */
	if (priv->sph_cap) {
		bool sph_en = (priv->hw->rx_csum > 0) && priv->sph;
		u32 chan;

		for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++)
			dn200_enable_sph(priv, priv->ioaddr, sph_en, chan,
					 priv->hw);
	}
	netdev->features = features;
	if (changed & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER))
		dn200_set_rx_mode(netdev);

	if (changed & NETIF_F_LOOPBACK)
		dn200_set_mac_loopback(priv, priv->ioaddr,
				       !!(features & NETIF_F_LOOPBACK));

	if (changed & NETIF_F_RXHASH) {
		if (features & NETIF_F_RXHASH)
			priv->rss.enable = true;
		else
			priv->rss.enable = false;

		dn200_rss_configure(priv, priv->hw, &priv->rss,
				    priv->plat->rx_queues_to_use);
	}
	if (changed & NETIF_F_NTUPLE) {
		dn200_l3_l4_filter_config(priv, priv->hw,
					  !!(netdev->features & NETIF_F_NTUPLE));
		dn200_fdirs_reconfig(priv,
				     !!(netdev->features & NETIF_F_NTUPLE));
	}
	return 0;
}

/* Service the non-DMA (MAC-level) interrupt sources: EST, LPI state
 * changes, per-queue MTL events and hw timestamp events.
 */
static void dn200_common_interrupt(struct dn200_priv *priv)
{
	u32 rx_cnt = priv->plat->rx_queues_to_use;
	u32 tx_cnt = priv->plat->tx_queues_to_use;
	u32 queues_count;
	u32 queue;
	bool xmac;

	xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
	queues_count = (rx_cnt > tx_cnt) ? rx_cnt : tx_cnt;

	if (priv->dma_cap.estsel)
		dn200_est_irq_status(priv, priv->ioaddr, priv->dev,
				     &priv->xstats, tx_cnt);
	/* To handle GMAC own interrupts */
	if (priv->plat->has_gmac || xmac) {
		int status =
			dn200_host_irq_status(priv, priv->hw, &priv->xstats);

		if (unlikely(status)) {
			/* For LPI we need to save the tx status */
			if (status & CORE_IRQ_TX_PATH_IN_LPI_MODE)
				priv->tx_path_in_lpi_mode = true;
			if (status & CORE_IRQ_TX_PATH_EXIT_LPI_MODE)
				priv->tx_path_in_lpi_mode = false;
		}

		/* mtl status is read (to ack) but otherwise unused here */
		for (queue = 0; queue < queues_count; queue++)
			status = dn200_host_mtl_irq_status(priv, priv->hw,
							   queue);

		dn200_timestamp_interrupt(priv, priv);
	}
}

/**
 * dn200_interrupt - main ISR
 * @irq: interrupt number.
 * @dev_id: to pass the net device pointer.
 * Description: this is the main driver interrupt service routine.
 * It can call:
 * o DMA service routine (to manage incoming frame reception and transmission
 * status)
 * o Core interrupts to manage: remote wake-up, management counter, LPI
 * interrupts.
+ */ +static irqreturn_t dn200_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct dn200_priv *priv = netdev_priv(dev); + + /* Check if adapter is up */ + if (unlikely(test_bit(DN200_DOWN, &priv->state))) + return IRQ_HANDLED; + + /* Check if a fatal error happened */ + if (dn200_safety_feat_interrupt(priv)) + return IRQ_HANDLED; + + /* To handle Common interrupts */ + dn200_common_interrupt(priv); + + /* To handle DMA interrupts */ + dn200_dma_interrupt(priv); + + return IRQ_HANDLED; +} + +static irqreturn_t dn200_mac_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct dn200_priv *priv = netdev_priv(dev); + + if (unlikely(!dev)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(DN200_DOWN, &priv->state)) + return IRQ_HANDLED; + + /* To handle Common interrupts */ + dn200_common_interrupt(priv); + + return IRQ_HANDLED; +} + +static irqreturn_t dn200_safety_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct dn200_priv *priv = netdev_priv(dev); + + if (unlikely(!dev)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(DN200_DOWN, &priv->state)) + return IRQ_HANDLED; + + /* Check if a fatal error happened */ + dn200_safety_feat_interrupt(priv); + + return IRQ_HANDLED; +} + +static irqreturn_t dn200_msi_intr_tx(int irq, void *data) +{ + struct dn200_tx_queue *tx_q = (struct dn200_tx_queue *)data; + int chan = tx_q->queue_index; + struct dn200_priv *priv; + int status; + + priv = container_of(tx_q, struct dn200_priv, tx_queue[chan]); + priv->xstats.tx_normal_irq_n++; + + if (unlikely(!data)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + /* Check if adapter is up */ + if (unlikely(test_bit(DN200_DOWN, 
&priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) + return IRQ_HANDLED; + if (unlikely(!dn200_dp_hwif_id_check(priv->ioaddr))) { + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + return IRQ_HANDLED; + } + + status = dn200_napi_check(priv, chan, DMA_DIR_TX); + + if (unlikely(status == tx_hard_error)) + dn200_tx_err(priv, chan); + + return IRQ_HANDLED; +} + +static irqreturn_t dn200_msi_intr_rx(int irq, void *data) +{ + struct dn200_rx_queue *rx_q = (struct dn200_rx_queue *)data; + int chan = rx_q->queue_index; + struct dn200_priv *priv; + + priv = container_of(rx_q, struct dn200_priv, rx_queue[chan]); + priv->xstats.rx_normal_irq_n++; + + if (unlikely(!data)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + /* Check if adapter is up */ + if (unlikely(test_bit(DN200_DOWN, &priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) + return IRQ_HANDLED; + if (unlikely(!dn200_dp_hwif_id_check(priv->ioaddr))) { + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + return IRQ_HANDLED; + } + dn200_napi_check(priv, chan, DMA_DIR_RX); + + return IRQ_HANDLED; +} + +static irqreturn_t dn200_msi_intr_rxtx(int irq, void *data) +{ + struct dn200_channel *ch = (struct dn200_channel *)data; + struct dn200_priv *priv; + struct napi_struct *agg_napi; + struct dn200_itr_info *rx_intr; + struct dn200_itr_info *tx_intr; + + if (unlikely(!data)) + return IRQ_NONE; + + priv = ch->priv_data; + rx_intr = &priv->rx_intr[ch->index]; + tx_intr = &priv->tx_intr[ch->index]; + + if (irq == priv->tx_irq[ch->index]) + priv->xstats.tx_normal_irq_n++; + if (irq == priv->rx_irq[ch->index]) + priv->xstats.rx_normal_irq_n++; + + /* Check if adapter is up */ + if (unlikely(test_bit(DN200_DOWN, &priv->state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state))) + return IRQ_HANDLED; + if (!dn200_dp_hwif_id_check(priv->ioaddr)) { + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + return IRQ_HANDLED; + } + agg_napi = &ch->agg_napi; + if 
(napi_schedule_prep(agg_napi)) { + if (rx_intr->itr_setting & DN200_ITR_DYNAMIC_ITR && + rx_intr->current_itr > DN200_ITR_RWT_BOUND) + dn200_rx_watchdog(priv, priv->ioaddr, + DN200_ITR_MAX_RWT, ch->index, priv->hw); + + dn200_disable_tx_dma_irq(priv->ioaddr, ch->index, priv->hw); + __napi_schedule(agg_napi); + } + + return IRQ_HANDLED; +} + +/* Polling receive - used by NETCONSOLE and other diagnostic tools + * to allow network I/O with interrupts disabled. + */ +static void dn200_poll_controller(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + int i; + + /* If adapter is down, do nothing */ + if (test_bit(DN200_DOWN, &priv->state)) + return; + + if (priv->plat->multi_msi_en) { + for (i = 0; i < priv->plat->rx_queues_to_use; i++) + dn200_msi_intr_rx(0, &priv->rx_queue[i]); + + for (i = 0; i < priv->plat->tx_queues_to_use; i++) + dn200_msi_intr_tx(0, &priv->tx_queue[i]); + } else { + disable_irq(dev->irq); + dn200_interrupt(dev->irq, dev); + enable_irq(dev->irq); + } +} + +static int dn200_mii_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct mii_ioctl_data *data = if_mii(rq); + int ret; + + if (!priv->mii) + return -EOPNOTSUPP; + + switch (cmd) { + case SIOCGMIIPHY: + data->phy_id = priv->plat->phy_addr; + break; + case SIOCGMIIREG: + ret = priv->mii->read(priv->mii, priv->plat->phy_addr, + data->reg_num & 0x1F); + if (ret < 0) + return ret; + data->val_out = ret; + break; + case SIOCSMIIREG: + default: + return -EOPNOTSUPP; + } + return 0; +} + +/** + * dn200_ioctl - Entry point for the Ioctl + * @dev: Device pointer. + * @rq: An IOCTL specefic structure, that can contain a pointer to + * a proprietary structure used to pass information to the driver. + * @cmd: IOCTL command + * Description: + * Currently it supports the phy_mii_ioctl(...) and HW time stamping. 
+ */ +static int dn200_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + int ret = -EOPNOTSUPP; + + if (!netif_running(dev)) + return -EINVAL; + + switch (cmd) { + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + return dn200_mii_ioctl(dev, rq, cmd); + case SIOCSHWTSTAMP: + ret = dn200_hwtstamp_set(dev, rq); + break; + case SIOCGHWTSTAMP: + ret = dn200_hwtstamp_get(dev, rq); + break; + default: + break; + } + + return ret; +} + +static LIST_HEAD(dn200_block_cb_list); +static u16 dn200_skb_tx_hash(struct net_device *dev, + const struct sk_buff *skb, u16 num_tx_queues) +{ + u32 jhash_initval_salt = 0xd631614b; + u32 hash; + + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16)skb->protocol ^ skb->hash; + + hash = jhash_1word(hash, jhash_initval_salt); + + return (u16)(((u64)hash * num_tx_queues) >> 32); +} + + +static u16 dn200_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + int gso = skb_shinfo(skb)->gso_type; + int dscp = 0; + /* for MTU bigger than 1500 and packet length bigger than 1518, + * always use queue 0 to xmit, as just queue 0 support jumbo + */ + if (dev->mtu > 1500 && skb->len > 1518) + return 0; + if (!netdev_get_num_tc(dev) || !priv->ets) + goto kernel_pick; + + /* DSCP mode or PCP mode? 
*/ + if (priv->dscp_app_cnt) { + if (skb->protocol == htons(ETH_P_IP)) + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + else if (skb->protocol == htons(ETH_P_IPV6)) + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + + if (dscp < 64) + return priv->ets->prio_tc[priv->dscp2up[dscp]]; + } else if (skb_vlan_tag_present(skb)) { + return priv->ets->prio_tc[skb_vlan_tag_get_prio(skb)]; + } + +kernel_pick: + if (gso & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) + return 0; + + if ((skb->sk && skb->sk->sk_hash) || skb->hash) + return dn200_skb_tx_hash(dev, skb, + dev->real_num_tx_queues) % + dev->real_num_tx_queues; + else + return netdev_pick_tx(dev, skb, sb_dev) % dev->real_num_tx_queues; +} + +static int dn200_set_mac_address(struct net_device *ndev, void *addr) +{ + struct dn200_priv *priv = netdev_priv(ndev); + struct sockaddr *paddr = addr; + int ret = 0; + + if (!is_valid_ether_addr(paddr->sa_data)) + return -EADDRNOTAVAIL; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + eth_hw_addr_set(ndev, paddr->sa_data); + /* netdev set mac before call dev open, down or opening state don't allow to set rpx */ + if (netif_running(ndev) && + !test_bit(DN200_DOWN, &priv->state) && + !test_bit(DN200_DEV_INIT, &priv->state)) { + ret = _dn200_set_umac_addr(priv, (unsigned char *)ndev->dev_addr, 0); + } + + /* when ret bigger than 0, means set umac filter success, + * but need to return 0 to dev layer, otherwise dev layer will + * not call notifier chain to flush dev arp entries and cause stop flow a moment + */ + if (ret > 0) + ret = 0; + + return ret; +} + +static bool dn200_netdev_is_memb_of_bond(struct 
net_device *netdev) +{ + struct net_device *upper_dev = NULL; + + upper_dev = netdev_master_upper_dev_get(netdev); + if (upper_dev && netif_is_bond_master(upper_dev)) + return true; + + return false; +} + +int dn200_dev_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct dn200_priv *priv = netdev_priv(dev); + + if (dev->netdev_ops != &dn200_netdev_ops && + dev->netdev_ops != &dn200_vf_netdev_ops) + goto done; + + switch (event) { + case NETDEV_CHANGENAME: + if (priv->dbgfs_dir) + priv->dbgfs_dir = + debugfs_rename(priv->dbgfs_dir->d_parent, + priv->dbgfs_dir, + priv->dbgfs_dir->d_parent, + dev->name); + break; + case NETDEV_CHANGEUPPER: + if (dev == priv->dev) { + /* add slave netdev(bonding member) */ + if (dn200_netdev_is_memb_of_bond(dev)) + set_bit(DN200_IS_BONDING, &priv->state); + else + /* remove slave netdev(bonding member) */ + clear_bit(DN200_IS_BONDING, &priv->state); + } + + default: + break; + } +done: + return NOTIFY_DONE; +} + +/* Use network device events to rename debugfs file entries. 
+ */ +#define DN200_DEFINE_SHOW_ATTRIBUTE(__name) \ + static int __name##_open(struct inode *inode, struct file *file) \ + { \ + return single_open(file, __name##_show, inode->i_private); \ + } \ + \ + static const struct file_operations __name##_fops = { \ + .owner = THIS_MODULE, \ + .open = __name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + } + +static void sysfs_display_ring(void *head, int size, int extend_desc, + struct seq_file *seq, dma_addr_t dma_phy_addr) +{ + int i; + struct dma_desc *p = (struct dma_desc *)head; + dma_addr_t dma_addr; + + for (i = 0; i < size; i++) { + dma_addr = dma_phy_addr + i * sizeof(*p); + seq_printf(seq, "%d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", + i, &dma_addr, + le32_to_cpu(p->des0), le32_to_cpu(p->des1), + le32_to_cpu(p->des2), le32_to_cpu(p->des3)); + p++; + seq_puts(seq, "\n"); + } +} + +static int dn200_rings_status_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + u32 rx_count = priv->plat->rx_queues_to_use; + u32 tx_count = priv->plat->tx_queues_to_use; + u32 queue; + + if ((dev->flags & IFF_UP) == 0) + return 0; + + for (queue = 0; queue < rx_count; queue++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + seq_printf(seq, "RX Queue %d:\n", queue); + + seq_puts(seq, "Descriptor ring:\n"); + sysfs_display_ring((void *)rx_q->dma_rx, + priv->dma_rx_size, 0, seq, rx_q->dma_rx_phy); + } + + for (queue = 0; queue < tx_count; queue++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + + seq_printf(seq, "TX Queue %d:\n", queue); + seq_puts(seq, "Descriptor ring:\n"); + sysfs_display_ring((void *)tx_q->dma_tx, + priv->dma_tx_size, 0, seq, tx_q->dma_tx_phy); + } + + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_rings_status); + +static int dn200_dma_cap_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + + if 
(!priv->hw_cap_support) { + seq_puts(seq, "DMA HW features not supported\n"); + return 0; + } + + seq_puts(seq, "==============================\n"); + seq_puts(seq, "\tDMA HW features\n"); + seq_puts(seq, "==============================\n"); + + seq_printf(seq, "\t10/100 Mbps: %s\n", + (priv->dma_cap.mbps_10_100) ? "Y" : "N"); + seq_printf(seq, "\t1000 Mbps: %s\n", + (priv->dma_cap.mbps_1000) ? "Y" : "N"); + seq_printf(seq, "\tHalf duplex: %s\n", + (priv->dma_cap.half_duplex) ? "Y" : "N"); + seq_printf(seq, "\tHash Filter: %s\n", + (priv->dma_cap.hash_filter) ? "Y" : "N"); + seq_printf(seq, "\tMultiple MAC address registers: %s\n", + (priv->dma_cap.multi_addr) ? "Y" : "N"); + seq_printf(seq, "\tPCS (TBI/SGMII/RTBI PHY interfaces): %s\n", + (priv->dma_cap.pcs) ? "Y" : "N"); + seq_printf(seq, "\tSMA (MDIO) Interface: %s\n", + (priv->dma_cap.sma_mdio) ? "Y" : "N"); + seq_printf(seq, "\tPMT Remote wake up: %s\n", + (priv->dma_cap.pmt_remote_wake_up) ? "Y" : "N"); + seq_printf(seq, "\tPMT Magic Frame: %s\n", + (priv->dma_cap.pmt_magic_frame) ? "Y" : "N"); + seq_printf(seq, "\tRMON module: %s\n", + (priv->dma_cap.rmon) ? "Y" : "N"); + seq_printf(seq, "\tIEEE 1588-2002 Time Stamp: %s\n", + (priv->dma_cap.time_stamp) ? "Y" : "N"); + seq_printf(seq, "\tIEEE 1588-2008 Advanced Time Stamp: %s\n", + (priv->dma_cap.atime_stamp) ? "Y" : "N"); + seq_printf(seq, "\t802.3az - Energy-Efficient Ethernet (EEE): %s\n", + (priv->dma_cap.eee) ? "Y" : "N"); + seq_printf(seq, "\tAV features: %s\n", (priv->dma_cap.av) ? "Y" : "N"); + seq_printf(seq, "\tChecksum Offload in TX: %s\n", + (priv->dma_cap.tx_coe) ? "Y" : "N"); + if (priv->chip_id >= DWMAC_CORE_4_00) { + seq_printf(seq, "\tIP Checksum Offload in RX: %s\n", + (priv->dma_cap.rx_coe) ? "Y" : "N"); + } else { + seq_printf(seq, "\tIP Checksum Offload (type1) in RX: %s\n", + (priv->dma_cap.rx_coe_type1) ? "Y" : "N"); + seq_printf(seq, "\tIP Checksum Offload (type2) in RX: %s\n", + (priv->dma_cap.rx_coe_type2) ? 
"Y" : "N"); + } + seq_printf(seq, "\tRXFIFO > 2048bytes: %s\n", + (priv->dma_cap.rxfifo_over_2048) ? "Y" : "N"); + seq_printf(seq, "\tNumber of Additional RX channel: %d\n", + priv->dma_cap.number_rx_channel); + seq_printf(seq, "\tNumber of Additional TX channel: %d\n", + priv->dma_cap.number_tx_channel); + seq_printf(seq, "\tNumber of Additional RX queues: %d\n", + priv->dma_cap.number_rx_queues); + seq_printf(seq, "\tNumber of Additional TX queues: %d\n", + priv->dma_cap.number_tx_queues); + seq_printf(seq, "\tEnhanced descriptors: %s\n", + (priv->dma_cap.enh_desc) ? "Y" : "N"); + seq_printf(seq, "\tTX Fifo Size: %d\n", priv->dma_cap.tx_fifo_size); + seq_printf(seq, "\tRX Fifo Size: %d\n", priv->dma_cap.rx_fifo_size); + seq_printf(seq, "\tHash Table Size: %d\n", priv->dma_cap.hash_tb_sz); + seq_printf(seq, "\tTSO: %s\n", priv->dma_cap.tsoen ? "Y" : "N"); + seq_printf(seq, "\tNumber of PPS Outputs: %d\n", + priv->dma_cap.pps_out_num); + seq_printf(seq, "\tSafety Features: %s\n", + priv->dma_cap.asp ? "Y" : "N"); + seq_printf(seq, "\tFlexible RX Parser: %s\n", + priv->dma_cap.frpsel ? "Y" : "N"); + seq_printf(seq, "\tEnhanced Addressing: %d\n", priv->dma_cap.addr64); + seq_printf(seq, "\tReceive Side Scaling: %s\n", + priv->dma_cap.rssen ? "Y" : "N"); + seq_printf(seq, "\tVLAN Hash Filtering: %s\n", + priv->dma_cap.vlhash ? "Y" : "N"); + seq_printf(seq, "\tSplit Header: %s\n", + priv->dma_cap.sphen ? "Y" : "N"); + seq_printf(seq, "\tVLAN TX Insertion: %s\n", + priv->dma_cap.vlins ? "Y" : "N"); + seq_printf(seq, "\tDouble VLAN: %s\n", priv->dma_cap.dvlan ? "Y" : "N"); + seq_printf(seq, "\tNumber of L3/L4 Filters: %d\n", + priv->dma_cap.l3l4fnum); + seq_printf(seq, "\tARP Offloading: %s\n", + priv->dma_cap.arpoffsel ? "Y" : "N"); + seq_printf(seq, "\tEnhancements to Scheduled Traffic (EST): %s\n", + priv->dma_cap.estsel ? "Y" : "N"); + seq_printf(seq, "\tFrame Preemption (FPE): %s\n", + priv->dma_cap.fpesel ? 
"Y" : "N"); + seq_printf(seq, "\tTime-Based Scheduling (TBS): %s\n", + priv->dma_cap.tbssel ? "Y" : "N"); + seq_printf(seq, "\tIs Bonding Member: %s\n", + test_bit(DN200_IS_BONDING, &priv->state) ? "Y" : "N"); + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_dma_cap); + +static int dn200_tx_diag_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + u32 tx_count = priv->plat->tx_queues_to_use; + u32 queue, hw_chan; + u32 cache_lvl; + u32 ch_tail, ch_curr, tx_avail; + u32 dbg_status, dbg_status0, dbg_status1; + + if ((dev->flags & IFF_UP) == 0) + return 0; + + seq_puts(seq, "=== Tx data path debug info ===\n"); + + for (queue = 0; queue < tx_count; queue++) { + struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + u32 tobe_consume = 0; + + hw_chan = queue + DN200_RXQ_START_GET(priv->hw); + + /* get sw & hw ring status, include sw available descs number, + * hw to be consume descs number + */ + ch_tail = readl(priv->ioaddr + + XGMAC_DMA_CH_TxDESC_TAIL_LPTR(hw_chan)); + ch_curr = readl(priv->ioaddr + + XGMAC_DMA_CH_TxDESC_CURR_LPTR(hw_chan)); + tx_avail = dn200_tx_avail(priv, queue); + + seq_printf(seq, "TX Queue %d channel %d %s\n", queue, hw_chan, + ((tx_avail < priv->dma_tx_size - 1) || + (ch_tail != ch_curr)) ? + "- running" : + ""); + + if (ch_tail != ch_curr) + tobe_consume = (ch_curr > ch_tail) ? + (priv->dma_tx_size - + ((ch_curr - ch_tail) / 16)) : + ((ch_tail - ch_curr) / 16 + 1); + + seq_printf(seq, + "Desc sw ring curr_tx:%d, dirty_tx:%d, tx available:%d, result:%s\n", + tx_q->cur_tx, tx_q->dirty_tx, tx_avail, + tx_avail > + DN200_TX_THRESH(priv) ? "yes (good)" : + "no (abnormal)"); + + seq_printf(seq, + "Desc hw ring start:%#x, tail:%#x, curr:%#x, to consume:%u, result:%s\n", + readl(priv->ioaddr + + XGMAC_DMA_CH_TxDESC_LADDR(hw_chan)), ch_tail, + ch_curr, tobe_consume, + tobe_consume > + (priv->dma_tx_size / + 2) ? "no (abnormal)" : "yes (good)"); + + /*2. 
get descs cache level */ + cache_lvl = + readl(priv->ioaddr + XGMAC_CH_DESC_CACHE_LVL(hw_chan)); + seq_printf(seq, + "[reg 3168]Desc tx cache levle is:%lu, cache_lvl:%#x\n", + (cache_lvl & XGMAC_TXLVL), cache_lvl); + + /*3. get dma channel debug status */ + dbg_status = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(hw_chan)); + seq_printf(seq, + "[reg 3164]DMA channel TDWS-des write state:%#lx, TDTS-data transfer state:%#lx, TDFS-des fetch state:%#lx, TDRS:%#lx, TDXS:%#lx, dbg_status:%#x\n", + (dbg_status & XGMAC_TDWS) >> XGMAC_TDWS_SHIFT, + (dbg_status & XGMAC_TDTS) >> XGMAC_TDTS_SHIFT, + (dbg_status & XGMAC_TDFS) >> XGMAC_TDFS_SHIFT, + (dbg_status & XGMAC_TDRS) >> XGMAC_TDRS_SHIFT, + (dbg_status & XGMAC_TDXS) >> XGMAC_TDXS_SHIFT, + dbg_status); + + /*4. get tx dma FSM debug status1(dma_debug_status1) */ + dbg_status1 = readl(priv->ioaddr + XGMAC_DEBUG_ST1); + seq_printf(seq, + "[reg 3024]Chanel %d DMA FSMs are%s actively processing the descriptors or packet data, dma_debug_status1:%#x.\n", + hw_chan, + (dbg_status1 & (1 << hw_chan)) ? "" : " Not", + dbg_status1); + + /*5. get tx dma debug status0(dma_debug_status0) */ + dbg_status0 = readl(priv->ioaddr + XGMAC_DEBUG_ST0); + seq_printf(seq, + "[reg 3020]AXI Master Read Channel is%s active(tx global status), dma_debug_status0:%#x.\n", + (dbg_status0 & XGMAC_AXRHSTS) ? 
"" : " Not", + dbg_status0); + } + + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_tx_diag); + +static int dn200_rx_diag_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue, hw_chan; + u32 cache_lvl; + u32 mtl_rxq_dbg = 0; + u32 mac_dbg = 0; + u32 ch_tail, ch_curr, avail, tobe_refill, dbg_status, dbg_status0, + dma_chan_status; + + if ((dev->flags & IFF_UP) == 0) + return 0; + + seq_puts(seq, "=== Rx data path diag info ===\n"); + + mac_dbg = readl(priv->ioaddr + XGMAC_MAC_DEBUG); + seq_printf(seq, + "[reg 114]MAC RX dbg, MAC GMII/XGMII rx engine st:%#x, MAC rx small fifo st:%#x, reg:%#x\n", + (mac_dbg & 0x1), (mac_dbg & 0x6 >> 1), + mac_dbg); + for (queue = 0; queue <= DN200_LAST_QUEUE(priv); queue++) { + if (DN200_MTL_QUEUE_IS_VALID(priv, queue)) { + mtl_rxq_dbg = readl(priv->ioaddr + XGMAC_MTL_RXQ_DEBUG(queue)); + seq_printf(seq, + "[reg 1148]MTL FIFO:%d, Pkt Num in RXQ:%ld, RXQ fill level:%#lx, reg:%#x\n", queue, + ((mtl_rxq_dbg & XGMAC_PRXQ) >> 16), + ((mtl_rxq_dbg & XGMAC_RXQSTS) >> 4), + mtl_rxq_dbg); + } + } + + for (queue = 0; queue < rx_count; queue++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + hw_chan = queue + DN200_RXQ_START_GET(priv->hw); + + /* get sw & hw ring status, include sw to be refilled descs number, + * hw available descs number + */ + ch_tail = readl(priv->ioaddr + + XGMAC_DMA_CH_RxDESC_TAIL_LPTR(hw_chan)); + ch_curr = readl(priv->ioaddr + + XGMAC_DMA_CH_RxDESC_CURR_LPTR(hw_chan)); + avail = (ch_curr > ch_tail) ? (priv->dma_rx_size - + ((ch_curr - ch_tail) / 16 - 1)) : + ((ch_tail - ch_curr) / 16 + 1); + tobe_refill = DN200_DESC_UNUSED(priv, rx_q); + + seq_printf(seq, "RX Queue %d channel %d %s\n", queue, hw_chan, + (tobe_refill > 0) ? 
"- running" : ""); + + seq_printf(seq, + "Desc sw ring curr_rx:%d, dirty_rx:%d, tobe refilled:%d, rxi_us:%d, result:%s\n", + rx_q->cur_rx, rx_q->dirty_rx, tobe_refill, priv->rx_intr[queue].target_itr, + tobe_refill > + (priv->dma_rx_size / + 2) ? "no (abnormal)" : "yes (good)"); + + seq_printf(seq, + "Desc hw ring start:%#x, tail:%#x, curr:%#x, available:%u, result:%s.\n", + readl(priv->ioaddr + + XGMAC_DMA_CH_RxDESC_LADDR(hw_chan)), ch_tail, + ch_curr, avail, + avail > + (priv->dma_rx_size / + 4) ? "yes (good)" : "no (abnormal)"); + + /*2. if hw ring tail equal curr, or RBU/FBE occur, get the interrupt status */ + /* if hw ring tail == curr, means the hw have no descs to use, maybe sw stop to rx, + * e.g wihtout any hw interrupt is notified to cpu + * + * if RBU bit is set, means Receive Buffer Unavailable, + * or the application owns the next descriptor + * in the Receive list, and the DMA cannot acquire it. + * The Rx process is suspended. + * e.g. rx interrupt slow to trigger caused by firmware local cpu to + * process other jobs + */ + dma_chan_status = + readl(priv->ioaddr + XGMAC_DMA_CH_STATUS(hw_chan)); + /* RBU - rx buffer unavailable, FBE - fatal bus error */ + seq_printf(seq, + "[reg 3160]%s, all queue rx_normal_irq_n:%llu, reg val:%#x.\n", + ((ch_tail == ch_curr) || + (dma_chan_status & (XGMAC_RBU | XGMAC_FBE))) ? + "Error:slow or no itr lead to RBU or hw ring empty(or FBE)" + : "Good:rx dma channel status ok.", + priv->xstats.rx_normal_irq_n, dma_chan_status); + + /*3. get descs cache level */ + cache_lvl = + readl(priv->ioaddr + XGMAC_CH_DESC_CACHE_LVL(hw_chan)); + seq_printf(seq, + "[reg 3168]Desc rx cache level is:%lu, tx cache levle is:%lu, cache_lvl:%#x\n", + (cache_lvl & XGMAC_RXLVL) >> XGMAC_RXLVL_SHIFT, + (cache_lvl & XGMAC_TXLVL), cache_lvl); + + /*4. 
get dma channel debug status */ + dbg_status = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(hw_chan)); + seq_printf(seq, + "[reg 3164]DMA channel RDWS-des write state:%#lx, RDTS-data transfer state:%#lx, RDFS-des fetch state:%#lx, dbg_status:%#x\n", + (dbg_status & XGMAC_RDWS) >> XGMAC_RDWS_SHIFT, + (dbg_status & XGMAC_RDTS) >> XGMAC_RDTS_SHIFT, + (dbg_status & XGMAC_RDFS) >> XGMAC_RDFS_SHIFT, + dbg_status); + + /*5. get rx dma debug status0(dma_debug_status0) */ + dbg_status0 = readl(priv->ioaddr + XGMAC_DEBUG_ST0); + seq_printf(seq, + "[reg 3020]AXI Master Write Channel is%s active(rx global status), dma_debug_status0:%#x.\n", + (dbg_status0 & XGMAC_AXWHSTS) ? "" : " Not", + dbg_status0); + } + + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_rx_diag); + +static int dn200_rx_buf_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + u32 rx_count = priv->plat->rx_queues_to_use; + u32 queue, cache_id; + u32 head, tail, stride, avail_entry; + struct dn200_bufring *r; + struct dn200_page_buf *rx_buf; + int idx = 0; + + if ((dev->flags & IFF_UP) == 0) + return 0; + + seq_puts(seq, "=== Rx Buffer debug info ===\n"); + + for (queue = 0; queue < rx_count; queue++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + struct dn200_bufpool_cache *local_cache = + &rx_q->rx_pool->local_cache[queue]; + struct dn200_buf_cache_ring *buf_cached = + &local_cache->buf_cached; + struct dn200_buf_refill_stack *buf_refill = + &local_cache->buf_refill; + seq_printf(seq, "=== Rx buf queue:%d ===\n", queue); + seq_printf(seq, + "-- Refill stack: cache size:%d, current len:%d --\n", + buf_refill->cache_size, buf_refill->current_len); + + for (cache_id = 0; cache_id < buf_refill->current_len; + ++cache_id) { + rx_buf = + (struct dn200_page_buf *)buf_refill->objs[cache_id]; + if (!rx_buf->page) + continue; + + seq_printf(seq, + "cache id:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, 
busy_cnt:%d, page:%p, page_ref_count:%d\n", + cache_id, rx_buf->kernel_addr, + rx_buf->desc_addr, rx_buf->page_offset, + page_to_phys(rx_buf->page), rx_buf->buf_len, + rx_buf->busy_cnt, rx_buf->page, + page_ref_count(rx_buf->page)); + } + + seq_printf(seq, + "-- Cache ring: cache size:%d, head:%d, tail:%d --\n", + buf_cached->cache_size, buf_cached->head, + buf_cached->tail); + if (buf_cached->head >= buf_cached->tail) { + for (cache_id = buf_cached->tail; + cache_id < buf_cached->head; cache_id++) { + rx_buf = + (struct dn200_page_buf *)buf_cached->objs[cache_id]; + if (!rx_buf->page) + continue; + + seq_printf(seq, + "cache id:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + cache_id, rx_buf->kernel_addr, + rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), + rx_buf->buf_len, rx_buf->busy_cnt, + rx_buf->page, + page_ref_count(rx_buf->page)); + } + } else { + for (cache_id = buf_cached->tail; + cache_id < buf_cached->cache_size; cache_id++) { + rx_buf = + (struct dn200_page_buf *)buf_cached->objs[cache_id]; + if (!rx_buf->page) + continue; + + seq_printf(seq, + "cache id:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + cache_id, rx_buf->kernel_addr, + rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), + rx_buf->buf_len, rx_buf->busy_cnt, + rx_buf->page, + page_ref_count(rx_buf->page)); + } + for (cache_id = 0; cache_id < buf_cached->head; + cache_id++) { + rx_buf = (struct dn200_page_buf *)buf_cached->objs[cache_id]; + if (!rx_buf->page) + continue; + + seq_printf(seq, + "cache id:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + cache_id, rx_buf->kernel_addr, + rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), + rx_buf->buf_len, rx_buf->busy_cnt, + rx_buf->page, + page_ref_count(rx_buf->page)); + } + } + 
} + r = priv->buf_pool.pool_ring; + head = atomic_read(&r->prod.tail); + tail = atomic_read(&r->cons.head); + if (head >= tail) + avail_entry = head - tail; + else + avail_entry = r->ring_size + head - tail; + seq_printf(seq, + "==== total rx_buf %d pool_ring availd %d head %d tail %d ring_size %d====\n", + priv->page_pool.total_pages * + priv->buf_pool.buf_num_per_page, avail_entry, head, tail, + r->ring_size); + + r = priv->buf_pool.cached_ring; + head = atomic_read(&r->prod.tail); + tail = atomic_read(&r->cons.head); + if (head >= tail) + avail_entry = head - tail; + else + avail_entry = r->ring_size + head - tail; + + seq_printf(seq, + "====cached_ring availd %d head %d tail %d ring_size %d====\n", + avail_entry, head, tail, r->ring_size); + stride = priv->buf_pool.buf_num_per_page; + if (head >= tail) { + for (idx = tail; idx < head; idx += stride) { + rx_buf = (struct dn200_page_buf *)r->ring_objs[idx]; + if (page_ref_count(rx_buf->page) == 1) + continue; + + seq_printf(seq, + "idx:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + idx, rx_buf->kernel_addr, rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), rx_buf->buf_len, + rx_buf->busy_cnt, rx_buf->page, + page_ref_count(rx_buf->page)); + } + } else { + for (idx = tail; idx < r->ring_size; idx += stride) { + rx_buf = (struct dn200_page_buf *)r->ring_objs[idx]; + if (page_ref_count(rx_buf->page) == 1) + continue; + + seq_printf(seq, + "idx:%d, dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + idx, rx_buf->kernel_addr, rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), rx_buf->buf_len, + rx_buf->busy_cnt, rx_buf->page, + page_ref_count(rx_buf->page)); + } + for (idx = 0; idx < head; idx += stride) { + rx_buf = (struct dn200_page_buf *)r->ring_objs[idx]; + if (page_ref_count(rx_buf->page) == 1) + continue; + + seq_printf(seq, + "idx:%d, 
dma:%#llx, descaddr:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d\n", + idx, rx_buf->kernel_addr, rx_buf->desc_addr, + rx_buf->page_offset, + page_to_phys(rx_buf->page), rx_buf->buf_len, + rx_buf->busy_cnt, rx_buf->page, + page_ref_count(rx_buf->page)); + } + } + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_rx_buf); + +static int dn200_ring_status_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + int rx_chan = priv->plat->rx_queues_to_use; + struct dn200_rx_queue *rx_q = NULL; + int i = 0; + int count_rx = 0; + + if (!netif_running(dev)) + return 0; + seq_puts(seq, "=== Get RX ring status ===\n"); + for (; i < rx_chan; i++) { + rx_q = &priv->rx_queue[i]; + seq_printf(seq, "queue %d cur_rx %#x dirty_rx %#x\n", i, + rx_q->cur_rx, rx_q->dirty_rx); + seq_printf(seq, "init %#x tail_phy %#x curr %#x\n", + readl(priv->ioaddr + XGMAC_DMA_CH_RxDESC_LADDR(i)), + readl(priv->ioaddr + + XGMAC_DMA_CH_RxDESC_TAIL_LPTR(i)), + readl(priv->ioaddr + + XGMAC_DMA_CH_RxDESC_CURR_LPTR(i))); + if (readl(priv->ioaddr + XGMAC_DMA_CH_RxDESC_TAIL_LPTR(i)) == + readl(priv->ioaddr + XGMAC_DMA_CH_RxDESC_CURR_LPTR(i))) { + count_rx = dn200_rx(priv, 64, i, priv->dma_rx_size); + seq_printf(seq, "queue %d dn200_rx reture %d\n", i, + count_rx); + } + } + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_ring_status); + +static int dn200_hw_lock_test_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + unsigned long start_time, end_time; + bool is_locked = false; + int ret = 0; + + seq_puts(seq, "=== Test HW lock & unlock ===\n"); + start_time = ktime_get_ns(); + ret = dn200_hw_lock(priv->hw, &is_locked); + end_time = ktime_get_ns(); + if (ret == 0) { + seq_printf(seq, "HW locked successful, ret: %d, cost ns:%lu\n", + ret, end_time - start_time); + start_time = ktime_get_ns(); + dn200_hw_unlock(priv->hw, 
&is_locked); + end_time = ktime_get_ns(); + if (is_locked == false) { + seq_printf(seq, "HW unlock successful, ret: %d, cost ns:%lu\n", + ret, end_time - start_time); + } + } else { + seq_printf(seq, "HW lock failure, ret: %d\n", ret); + } + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_hw_lock_test); + +static int dn200_fifo_size_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + int rx_chan = priv->plat_ex->rx_queues_total; + int tx_chan = priv->plat_ex->tx_queues_total; + int i = 0; + u32 fifosize = 0; + u32 value = 0; + bool pf_use = false; + bool vf_use = false; + + if (PRIV_IS_VF(priv)) + return 0; + if (!netif_running(dev)) + return 0; + + seq_puts(seq, "=== GET HW RX FIFOSIZE ===\n"); + for ( ; i < rx_chan; i++) { + value = readl(priv->ioaddr + XGMAC_MTL_RXQ_OPMODE(i)); + fifosize = (value & XGMAC_RQS) >> XGMAC_RQS_SHIFT; + fifosize = (fifosize + 1) * 256; + seq_printf(seq, "mtl rx chan: %#x fifosize %#x\n", + i, fifosize); + } + + seq_puts(seq, "=== GET HW TX FIFOSIZE ===\n"); + for (i = 0; i < tx_chan; i++) { + value = readl(priv->ioaddr + XGMAC_MTL_TXQ_OPMODE(i)); + fifosize = (value & XGMAC_TQS) >> XGMAC_TQS_SHIFT; + fifosize = (fifosize + 1) * 256; + if (i < priv->plat->tx_queues_to_use) + pf_use = true; + else + pf_use = false; + if (PRIV_SRIOV_SUPPORT(priv)) { + if (i >= 8) + vf_use = true; + else + vf_use = false; + } + seq_printf(seq, "mtl tx chan: %#x fifosize %#x %s\n", + i, fifosize, + vf_use ? "vf use" : (pf_use ? 
"pf enable" : "pf disable")); + } + return 0; +} +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_fifo_size); + +static int dn200_rxp_satus_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + + dn200_rxp_filter_get(priv, priv->hw, seq); + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_rxp_satus); + +static int dn200_iatu_show(struct seq_file *seq, void *v) +{ + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + + if (!netif_running(dev)) + return 0; + + dn200_iatu_display(priv, seq); + return 0; +} + +DN200_DEFINE_SHOW_ATTRIBUTE(dn200_iatu); + +static char dn200_ext_phy_buf[128]; +static int dn200_ext_phy_show(struct seq_file *seq, void *v) +{ + if (strlen(dn200_ext_phy_buf)) + seq_printf(seq, "%s\n", dn200_ext_phy_buf); + return 0; +} + +static ssize_t dn200_ext_phy_write(struct file *file, + const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct net_device *dev = seq->private; + struct dn200_priv *priv = netdev_priv(dev); + struct phy_device *phydev; + char temp[64] = {}; + char *cur; + unsigned long value; + int addr; + u32 reg = 0, val = 0, len = 0; + + addr = priv->plat->phy_addr; + phydev = mdiobus_get_phy(priv->mii, addr); + if (!phydev) + return size; + + if (size > sizeof(temp)) { + sprintf(dn200_ext_phy_buf, "cmd is too long\n"); + return size; + } + if (copy_from_user(temp, buf, size)) + return size; + + cur = strchr(temp, ' '); + if (!cur) { + sprintf(dn200_ext_phy_buf, "\"r reg\" or \"w reg val\"\n"); + return 0; + } + while (*cur == ' ') + cur++; + + if (kstrtoul(cur, 0, &value)) + return 0; + reg = value; + + cur = strchr(cur, ' '); + if (cur) { + while (*cur == ' ') + cur++; + if (kstrtoul(cur, 0, &value)) + return 0; + val = value; + } + + if (reg >= 0x20) { + reg &= 0xffff; + if (temp[0] == 'w') { + ytphy_write_ext(phydev, reg, val); + len = sprintf(dn200_ext_phy_buf, + "Write phy %d ext.%#x 
%#x\n", addr, reg, val); + } + + sprintf(dn200_ext_phy_buf + len, + "Read phy %d ext.%#x val: %#x\n", addr, reg, + ytphy_read_ext(phydev, reg)); + } else { + if (temp[0] == 'w') { + mdiobus_write(priv->mii, priv->plat->phy_addr, reg, val); + len = sprintf(dn200_ext_phy_buf, + "Write phy %d %#x %#x\n", addr, reg, val); + } + + sprintf(dn200_ext_phy_buf + len, + "Read phy %d %#x val: %#x\n", addr, reg, + mdiobus_read(priv->mii, priv->plat->phy_addr, reg)); + } + + return size; +} + +static int dn200_ext_phy_open(struct inode *inode, struct file *file) +{ + return single_open(file, dn200_ext_phy_show, inode->i_private); +} + +static const struct file_operations dn200_ext_phy_fops = { + .owner = THIS_MODULE, + .open = dn200_ext_phy_open, + .read = seq_read, + .write = dn200_ext_phy_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void dn200_init_fs(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + + rtnl_lock(); + /* Create per netdev entries */ + priv->dbgfs_dir = debugfs_create_dir(dev->name, NULL); + + /* Entry to report DMA RX/TX rings */ + debugfs_create_file("descriptors_status", 0444, priv->dbgfs_dir, dev, + &dn200_rings_status_fops); + + /* Entry to report the DMA HW features */ + debugfs_create_file("dma_cap", 0444, priv->dbgfs_dir, dev, + &dn200_dma_cap_fops); + + debugfs_create_file("ring_status", 0444, priv->dbgfs_dir, dev, + &dn200_ring_status_fops); + + debugfs_create_file("lock_test", 0444, priv->dbgfs_dir, dev, + &dn200_hw_lock_test_fops); + + debugfs_create_file("fifo_size", 0444, priv->dbgfs_dir, dev, + &dn200_fifo_size_fops); + + debugfs_create_file("rxp_status", 0444, priv->dbgfs_dir, dev, + &dn200_rxp_satus_fops); + + /* Entry to report TX/RX data path diagnostic info */ + debugfs_create_file("tx_diag", 0444, priv->dbgfs_dir, dev, + &dn200_tx_diag_fops); + debugfs_create_file("rx_diag", 0444, priv->dbgfs_dir, dev, + &dn200_rx_diag_fops); + + debugfs_create_file("rx_buf", 0444, priv->dbgfs_dir, dev, + 
&dn200_rx_buf_fops); + debugfs_create_file("iatu", 0444, priv->dbgfs_dir, dev, + &dn200_iatu_fops); + if (priv->mii) + debugfs_create_file("ext_phy", 0644, priv->dbgfs_dir, dev, + &dn200_ext_phy_fops); + + rtnl_unlock(); +} + +static void dn200_exit_fs(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + + debugfs_remove_recursive(priv->dbgfs_dir); +} + +static int dn200_vlan_update(struct dn200_priv *priv) +{ + int count = 0; + u16 vid = 0; + + for_each_set_bit(vid, priv->active_vlans, VLAN_N_VID) { + if (count >= 4095) + return -1; + count++; + } + priv->plat_ex->vlan_num = count; + if (!(PRIV_IS_PUREPF(priv) || PRIV_IS_VF(priv)) && + !test_bit(DN200_DEV_INIT, &priv->state)) + _dn200_config_vlan_rx_fltr(priv, priv->hw, + !(priv->dev->flags & IFF_PROMISC)); + return priv->plat_ex->vlan_num; +} + +static int dn200_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid) +{ + struct dn200_priv *priv = netdev_priv(ndev); + bool is_double = false; + int ret; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + if (HW_IS_VF(priv->hw)) + return 0; + if (be16_to_cpu(proto) == ETH_P_8021AD) + is_double = true; + + if (is_double && !HW_IS_PUREPF(priv->hw)) + return -EOPNOTSUPP; + + set_bit(vid, priv->active_vlans); + ret = dn200_vlan_update(priv); + if (ret < 0) { + clear_bit(vid, priv->active_vlans); + return ret; + } + if (HW_IS_PUREPF(priv->hw)) { + ret = + dn200_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid, + 0, 0); + if (ret) + return ret; + } else { + priv->pf_rxp_set |= RXP_SET_VLAN_ID; + if (PRIV_SRIOV_SUPPORT(priv) && 
!test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + } + return 0; +} + +static void dn200_vlan_reconfig(struct dn200_priv *priv) +{ + u16 vid = 0; + + if (PRIV_IS_VF(priv)) + return; + + if (HW_IS_PUREPF(priv->hw)) { + for_each_set_bit(vid, priv->active_vlans, VLAN_N_VID) { + __le16 vid_le = cpu_to_le16(vid); + + dn200_add_hw_vlan_rx_fltr(priv, priv->dev, priv->hw, 0, + vid_le, 0, 0); + } + } else { + priv->pf_rxp_set |= RXP_SET_VLAN_ID; + if (PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + } +} + +static int dn200_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, + u16 vid) +{ + struct dn200_priv *priv = netdev_priv(ndev); + bool is_double = false; + int ret; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + if (HW_IS_VF(priv->hw)) + return 0; + if (be16_to_cpu(proto) == ETH_P_8021AD) + is_double = true; + + clear_bit(vid, priv->active_vlans); + ret = dn200_vlan_update(priv); + if (ret < 0) + return ret; + + if (HW_IS_PUREPF(priv->hw)) { + ret = + dn200_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid, + 0, 0); + if (ret) + return ret; + } else { + priv->pf_rxp_set |= RXP_SET_VLAN_ID; + if (PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + } + return 0; +} 
+ +void dn200_disable_rx_queue(struct dn200_priv *priv, u32 queue) +{ + struct dn200_channel *ch = &priv->channel[queue]; + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + dn200_disable_dma_irq(priv, priv->ioaddr, queue, 1, 0, priv->hw); + spin_unlock_irqrestore(&ch->lock, flags); + + dn200_stop_rx_dma(priv, queue); + __free_dma_rx_desc_resources(priv, queue); +} + +void dn200_enable_rx_queue(struct dn200_priv *priv, u32 queue) +{ + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + struct dn200_channel *ch = &priv->channel[queue]; + unsigned long flags; + int ret; + + ret = __alloc_dma_rx_desc_resources(priv, queue); + if (ret) { + netdev_err(priv->dev, "Failed to alloc RX desc.\n"); + return; + } + + ret = __init_dma_rx_desc_rings(priv, queue, GFP_KERNEL); + if (ret) { + __free_dma_rx_desc_resources(priv, queue); + netdev_err(priv->dev, "Failed to init RX desc.\n"); + return; + } + + dn200_clear_rx_descriptors(priv, queue); + + dn200_init_rx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, + rx_q->dma_rx_phy, rx_q->queue_index, priv->hw); + + rx_q->rx_tail_addr = rx_q->dma_rx_phy + (rx_q->buf_alloc_num * + sizeof(struct dma_desc)); + dn200_set_rx_tail_ptr(priv, priv->ioaddr, + rx_q->rx_tail_addr, rx_q->queue_index, priv->hw); + dn200_set_dma_bfsize(priv, priv->ioaddr, + priv->dma_buf_sz, rx_q->queue_index, priv->hw); + dn200_start_rx_dma(priv, queue); + + spin_lock_irqsave(&ch->lock, flags); + dn200_enable_dma_irq(priv, priv->ioaddr, queue, 1, 0, priv->hw); + spin_unlock_irqrestore(&ch->lock, flags); +} + +void dn200_disable_tx_queue(struct dn200_priv *priv, u32 queue) +{ + struct dn200_channel *ch = &priv->channel[queue]; + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + dn200_disable_dma_irq(priv, priv->ioaddr, queue, 0, 1, priv->hw); + spin_unlock_irqrestore(&ch->lock, flags); + + dn200_stop_tx_dma(priv, queue); + __free_dma_tx_desc_resources(priv, queue); +} + +void dn200_enable_tx_queue(struct dn200_priv *priv, u32 queue) +{ 
+ struct dn200_tx_queue *tx_q = &priv->tx_queue[queue]; + struct dn200_channel *ch = &priv->channel[queue]; + unsigned long flags; + int ret; + + ret = __alloc_dma_tx_desc_resources(priv, queue); + if (ret) { + netdev_err(priv->dev, "Failed to alloc TX desc.\n"); + return; + } + + ret = __init_dma_tx_desc_rings(priv, queue); + if (ret) { + __free_dma_tx_desc_resources(priv, queue); + netdev_err(priv->dev, "Failed to init TX desc.\n"); + return; + } + + dn200_clear_tx_descriptors(priv, queue); + + dn200_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, + tx_q->dma_tx_phy, tx_q->queue_index, priv->hw); + + tx_q->tx_tail_addr = tx_q->dma_tx_phy; + dn200_set_tx_tail_ptr(priv, priv->ioaddr, + tx_q->tx_tail_addr, tx_q->queue_index, priv->hw); + + dn200_start_tx_dma(priv, queue); + + spin_lock_irqsave(&ch->lock, flags); + dn200_enable_dma_irq(priv, priv->ioaddr, queue, 0, 1, priv->hw); + spin_unlock_irqrestore(&ch->lock, flags); +} + +static netdev_features_t +dn200_features_check(struct sk_buff *skb, + struct net_device __always_unused *netdev, + netdev_features_t features) +{ + /* No point in doing any of this if neither checksum nor GSO are + * being requested for this frame. We can rule out both by just + * checking for CHECKSUM_PARTIAL + */ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return features; + + /* We cannot support GSO if the MSS is going to be less than + * 64 bytes. If it is then we need to drop support for GSO. 
+ */ + if (skb_is_gso(skb) && (skb_shinfo(skb)->gso_size < 64)) + features &= ~NETIF_F_GSO_MASK; + + return features; +} + +static void dn200_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *s) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_counters *mmc = &priv->mmc; + + if (test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return; + if (PRIV_IS_VF(priv)) { + s->rx_packets = netdev->stats.rx_packets; + s->rx_bytes = netdev->stats.rx_bytes; + s->rx_errors = netdev->stats.rx_errors; + s->rx_dropped = netdev->stats.rx_dropped; + + s->tx_packets = netdev->stats.tx_packets; + s->tx_bytes = netdev->stats.tx_bytes; + s->tx_errors = netdev->stats.tx_errors; + s->tx_dropped = netdev->stats.tx_dropped; + } else { + if (!test_bit(DN200_SUSPENDED, &priv->state) && !test_bit(DN200_DOWN, &priv->state)) + dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc); + + s->rx_packets = netdev->stats.rx_packets; + s->rx_bytes = netdev->stats.rx_bytes; + s->rx_errors = mmc->mmc_rx_crc_error + + mmc->mmc_rx_align_error + + mmc->mmc_rx_run_error + + mmc->mmc_rx_jabber_error + + mmc->mmc_rx_length_error + + mmc->mmc_rx_watchdog_error + + mmc->mmc_rx_udp_err + + mmc->mmc_rx_tcp_err + + mmc->mmc_rx_icmp_err + + mmc->mmc_rx_packet_assembly_err_cntr; + s->multicast = mmc->mmc_rx_multicastframe_g; + s->rx_length_errors = mmc->mmc_rx_length_error; + s->rx_crc_errors = mmc->mmc_rx_crc_error; + + s->tx_packets = netdev->stats.tx_packets; + s->tx_bytes = netdev->stats.tx_bytes; + s->tx_errors = mmc->mmc_tx_underflow_error; + s->tx_dropped = netdev->stats.tx_dropped; + } +} + +static void dn200_linkset_subtask(struct dn200_priv *priv) +{ + int retry = 0; + + netdev_info(priv->dev, "vf adaptor link status set.\n"); + if (test_bit(DN200_DOWN, &priv->state) && + (priv->vf_link_action & LINK_DOWN_SET)) + return; + + if (!test_bit(DN200_DOWN, &priv->state) && + (priv->vf_link_action & LINK_UP_SET)) + return; + + if (test_bit(DN200_DOWN, &priv->state) && + (priv->vf_link_action 
& LINK_UP_SET)) { + netif_trans_update(priv->dev); + rtnl_lock(); + dev_open(priv->dev, NULL); + rtnl_unlock(); + } else if (!test_bit(DN200_DOWN, &priv->state) && + (priv->vf_link_action & LINK_DOWN_SET)) { + netif_trans_update(priv->dev); + while (test_and_set_bit(DN200_RESETING, &priv->state)) { + usleep_range(1000, 2000); + if (retry++ >= 3) + return; + } + rtnl_lock(); + dev_close(priv->dev); + rtnl_unlock(); + clear_bit(DN200_RESETING, &priv->state); + } + priv->vf_link_action = 0; +} + +static void dn200_linkset_task(struct work_struct *work) +{ + struct dn200_priv *priv = container_of(work, struct dn200_priv, + vf_linkset_task); + + dn200_linkset_subtask(priv); +} + +static void dn200_set_vf_rxp(struct dn200_priv *priv, u8 vf_num, struct dn200_vf_rxp_async_info *info) +{ + if (info->type & DN200_VF_CLEAR_RXP) { + dn200_wq_vf_del_rxp(priv, priv->hw, 1 + vf_num, info->rxq_start); + } else { + dn200_vf_append_rxp_bc(priv, priv->hw, info->rxq_start); + //if (info->type & DN200_VF_SET_UMAC) + dn200_wq_set_umac_addr(priv, priv->hw, NULL, 0, info); + //if (info->type & DN200_VF_SET_FLT) + dn200_wq_set_filter(priv, priv->hw, priv->dev, true, info); + } +} + +static void dn200_set_pf_rxp(struct dn200_priv *priv, struct dn200_vf_rxp_async_info *info) +{ + u8 vf_offset = 0; + + if (priv->pf_rxp_set & RXP_SET_UMAC) { + priv->pf_rxp_set &= ~RXP_SET_UMAC; + dn200_wq_set_umac_addr(priv, priv->hw, (unsigned char *)priv->dev->dev_addr, 0, info); + } + if (priv->pf_rxp_set & RXP_SET_FIL) { + priv->pf_rxp_set &= ~RXP_SET_FIL; + dn200_wq_set_filter(priv, priv->hw, priv->dev, false, NULL); + } + if (priv->pf_rxp_set & RXP_SET_VLAN_FIL) { + priv->pf_rxp_set &= ~RXP_SET_VLAN_FIL; + dn200_config_vlan_rx_fltr(priv, priv->hw, priv->vlan_fil_enable); + } + if (priv->pf_rxp_set & RXP_SET_VLAN_ID) { + priv->pf_rxp_set &= ~RXP_SET_VLAN_ID; + dn200_sriov_vlan_entry_update(priv); + } + if (priv->pf_rxp_set & RXP_CLEAR_VF_RXP) { + priv->pf_rxp_set &= ~RXP_CLEAR_VF_RXP; + for (vf_offset = 
0; vf_offset < DN200_MAX_VF_NUM; vf_offset++) { + if (priv->clear_vf_rxp_bitmap & (1 << vf_offset)) { + dn200_clear_vf_rxp(priv, priv->hw, vf_offset); + priv->clear_vf_rxp_bitmap &= ~(1 << vf_offset); + } + } + } +} + +void dn200_async_rxp_work(struct dn200_priv *priv) +{ + netdev_dbg(priv->dev, "%s: pf ready to set rxp.\n", __func__); + if (PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state)) + queue_work(priv->wq, &priv->rxp_task); + else if (PRIV_SRIOV_SUPPORT(priv)) + set_bit(DN200_RXP_NEED_CHECK, &priv->state); +} + +static void dn200_rxp_task(struct work_struct *work) +{ + struct dn200_priv *priv = container_of(work, struct dn200_priv, + rxp_task); + struct dn200_vf_rxp_async_info *info; + struct dn200_vf_rxp_async_info *lram_info; + size_t info_size = sizeof(struct dn200_vf_rxp_async_info); + u8 i = 0; + u32 tmp_crc32; + + if (!netif_carrier_ok(priv->dev)) { + set_bit(DN200_RXP_NEED_CHECK, &priv->state); + return; + } + if (test_and_set_bit(DN200_RXP_SETTING, &priv->state)) + return; + + lram_info = devm_kzalloc(priv->device, info_size, GFP_KERNEL); + if (!lram_info) + return; + /* check priv wq status first */ + if (priv->pf_rxp_set) + netdev_dbg(priv->dev, "%s %d pf_set_rxp %#x\n", __func__, __LINE__, priv->pf_rxp_set); + if (priv->pf_rxp_set != 0) + dn200_set_pf_rxp(priv, lram_info); + + /* check vf wq status*/ + for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) { + info = &priv->async_info[i]; + dn200_get_lram_rxp_async_info(priv->hw, (u8 *)lram_info, i); + if (lram_info->crc32) + netdev_dbg(priv->dev, "%s %d vf %d cfg_seq %d set seq %d, cfg crc32 %#x cur_crc32 %#x\n", __func__, __LINE__, i, + lram_info->seq, info->seq, lram_info->crc32, info->crc32); + if (info->seq == lram_info->seq && info->crc32 == lram_info->crc32) + continue; + tmp_crc32 = crc32_le(~0, (u8 *)lram_info + sizeof(u32), info_size - sizeof(u32)); + netdev_dbg(priv->dev, "%s %d vf %d tmp crc32 %#x\n", __func__, __LINE__, i, tmp_crc32); + if (lram_info->crc32 != 
tmp_crc32) + continue; + memcpy(info, lram_info, info_size); + dn200_set_vf_rxp(priv, i, info); + } + devm_kfree(priv->device, lram_info); + clear_bit(DN200_RXP_SETTING, &priv->state); +} + +static void dn200_uc_addr_get(struct dn200_priv *priv, int vf_id, u8 *addr); +static void dn200_uc_addr_set(struct dn200_priv *priv, int vf_id, u8 *addr); +static int dn200_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) +{ + u8 mac_addr[ETH_ALEN]; + struct dn200_priv *priv = netdev_priv(netdev); + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + if (PRIV_IS_PUREPF(priv)) { + netdev_warn(priv->dev, "PURE PF not support vf\n"); + return -EOPNOTSUPP; + } + + if (!priv->plat_ex->pf.registered_vfs) { + netdev_warn(priv->dev, "VF not exist\n"); + return -EAGAIN; + } else if (vf_id >= priv->plat_ex->pf.registered_vfs) { + netdev_err(priv->dev, "Invalid VF Identifier %d\n", vf_id); + return -EINVAL; + } + + if (is_valid_ether_addr(mac)) { + netdev_info(priv->dev, "setting MAC %pM on VF %d\n", + mac, vf_id); + dn200_uc_addr_get(priv, vf_id, mac_addr); + if (memcmp(mac, mac_addr, ETH_ALEN) == 0) { + netdev_warn(priv->dev, + "setting MAC and existed mac are the same\n"); + return 0; + } + dn200_uc_addr_set(priv, vf_id, mac); + DN200_ITR_SYNC_SET(priv->hw, vf_reset_mac_list, vf_id, 1); + irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); + return 0; + } + return -EINVAL; +} + +/** + * dn200_ndo_get_vf_config + * @netdev: network interface device structure + * @vf_id: VF identifier + * @ivi: VF configuration structure + * + * return VF configuration + **/ +static int 
dn200_ndo_get_vf_config(struct net_device *netdev, + int vf_id, struct ifla_vf_info *ivi) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + u8 mac[ETH_ALEN]; + int ret = 0; + u8 vf_link_notify = 0; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + ivi->vf = vf_id; + if (!priv->plat_ex->pf.registered_vfs) { + netdev_warn(priv->dev, "VF not exist\n"); + return -EAGAIN; + } else if (vf_id >= priv->plat_ex->pf.registered_vfs) { + netdev_err(priv->dev, "Invalid VF Identifier %d\n", vf_id); + return -EINVAL; + } + + dn200_uc_addr_get(priv, vf_id, mac); + ether_addr_copy(ivi->mac, mac); + DN200_VF_LINK_GET(priv, vf_id, &vf_link_notify); + if (priv->vf_link_forced[vf_id] == 0) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vf_link_notify && phy_info->link_status) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; + return ret; +} + +static void dn200_link_notify(struct dn200_priv *priv, int vf_id, + bool up_notify) +{ + if (up_notify) + DN200_ITR_SYNC_SET(priv->hw, vf_link_list, vf_id, LINK_UP_SET); + else + DN200_ITR_SYNC_SET(priv->hw, vf_link_list, vf_id, + LINK_DOWN_SET); + irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); +} + +/** + * stmamc_ndo_set_vf_link_state + * @netdev: network interface device structure + * @vf_id: VF identifier + * @link: required link state + * + * Set the link state of a specified VF, regardless of physical link state + **/ +static int dn200_ndo_set_vf_link_state(struct net_device *netdev, int vf_id, + int link) +{ + struct 
dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + if (!priv->plat_ex->pf.registered_vfs) { + netdev_warn(priv->dev, "VF not exist\n"); + return -EAGAIN; + } else if (vf_id >= priv->plat_ex->pf.registered_vfs) { + netdev_err(priv->dev, "Invalid VF Identifier %d\n", vf_id); + return -EINVAL; + } + switch (link) { + case IFLA_VF_LINK_STATE_AUTO: + priv->vf_link_forced[vf_id] = false; + break; + case IFLA_VF_LINK_STATE_ENABLE: + priv->vf_link_forced[vf_id] = true; + if (!phy_info->link_status) { + netdev_warn(priv->dev, + "vf%d can't be up for PF is not link up\n", + vf_id); + break; + } + dn200_link_notify(priv, vf_id, true); + break; + case IFLA_VF_LINK_STATE_DISABLE: + priv->vf_link_forced[vf_id] = true; + dn200_link_notify(priv, vf_id, false); + break; + default: + return -EINVAL; + } + return 0; +} + +static const struct net_device_ops dn200_netdev_ops = { + .ndo_open = dn200_open, + .ndo_start_xmit = dn200_xmit, + .ndo_stop = dn200_release, + .ndo_change_mtu = dn200_change_mtu, + .ndo_fix_features = dn200_fix_features, + .ndo_set_features = dn200_set_features, + .ndo_set_rx_mode = dn200_set_rx_mode, + .ndo_tx_timeout = dn200_tx_timeout, + .ndo_features_check = dn200_features_check, + .ndo_eth_ioctl = dn200_ioctl, + .ndo_select_queue = dn200_select_queue, + .ndo_poll_controller = dn200_poll_controller, + .ndo_set_mac_address = dn200_set_mac_address, + .ndo_vlan_rx_add_vid = dn200_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = dn200_vlan_rx_kill_vid, + .ndo_get_stats64 = 
dn200_get_stats64, + .ndo_set_vf_mac = dn200_ndo_set_vf_mac, + .ndo_get_vf_config = dn200_ndo_get_vf_config, + .ndo_set_vf_link_state = dn200_ndo_set_vf_link_state, +}; + +static const struct net_device_ops dn200_vf_netdev_ops = { + .ndo_open = dn200_open, + .ndo_start_xmit = dn200_xmit, + .ndo_stop = dn200_release, + .ndo_change_mtu = dn200_change_mtu, + .ndo_fix_features = dn200_fix_features, + .ndo_set_features = dn200_set_features, + .ndo_set_rx_mode = dn200_set_rx_mode, + .ndo_features_check = dn200_features_check, + .ndo_eth_ioctl = dn200_ioctl, + .ndo_select_queue = dn200_select_queue, + .ndo_poll_controller = dn200_poll_controller, + .ndo_set_mac_address = dn200_set_mac_address, + .ndo_get_stats64 = dn200_get_stats64, +}; + +static void dn200_stop_open_subtask(struct dn200_priv *priv) +{ + int retry = 0; + + if (test_bit(DN200_IN_REMOVE, &priv->state)) + goto result; + + if (test_bit(DN200_DEV_ERR_CLOSE, &priv->state) || + test_bit(DN200_PF_NORMAL_CLOSE, &priv->state)) { + if (test_bit(DN200_DOWN, &priv->state)) + goto result; + while (test_and_set_bit(DN200_RESETING, &priv->state)) { + usleep_range(1000, 2000); + if (retry++ >= 3) + return; + }; + rtnl_lock(); + dev_close(priv->dev); + rtnl_unlock(); + clear_bit(DN200_RESETING, &priv->state); + } + + if (test_bit(DN200_PF_NORMAL_OPEN, &priv->state)) { + rtnl_lock(); + dev_open(priv->dev, NULL); + rtnl_unlock(); + } +result: + if (test_bit(DN200_PF_NORMAL_OPEN, &priv->state) || + test_bit(DN200_PF_NORMAL_CLOSE, &priv->state)) { + clear_bit(DN200_PF_FLOW_NORMAL_SET, &priv->state); + + if (test_bit(DN200_PF_NORMAL_OPEN, &priv->state)) + clear_bit(DN200_PF_NORMAL_OPEN, &priv->state); + + if (test_bit(DN200_PF_NORMAL_CLOSE, &priv->state)) + clear_bit(DN200_PF_NORMAL_CLOSE, &priv->state); + } +} + +static void dn200_reset_subtask(struct dn200_priv *priv) +{ + int retry = 0; + u8 states; + + if (test_bit(DN200_DOWN, &priv->state)) + return; + if (test_bit(DN200_IN_REMOVE, &priv->state)) + return; + + 
DN200_GET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, &states); + if (!states) + return; + + netif_trans_update(priv->dev); + while (test_and_set_bit(DN200_RESETING, &priv->state)) { + usleep_range(1000, 2000); + if (retry++ >= 3) + return; + } + + if (test_bit(DN200_ERR_RESET, &priv->state)) { + /* sriov pf: when tx timeout or occur dma channel err in open, + * will reset hw to recover + * pure pf: just reset hw in netdev open + * vf: don't run this task and reset hw, + * will notify pf to do it when occur dma channel err or tx timeout + */ + if (netif_running(priv->dev)) { + rtnl_lock(); + dev_close(priv->dev); + rtnl_unlock(); + + rtnl_lock(); + dev_open(priv->dev, NULL); + priv->dev->netdev_ops->ndo_set_rx_mode(priv->dev); + rtnl_unlock(); + } + clear_bit(DN200_ERR_RESET, &priv->state); + } + + clear_bit(DN200_RESETING, &priv->state); +} + +static void dn200_service_task(struct work_struct *work) +{ + struct dn200_priv *priv = container_of(work, struct dn200_priv, + service_task); + + if (test_bit(DN200_DEV_ERR_CLOSE, &priv->state) || + test_bit(DN200_PF_NORMAL_CLOSE, &priv->state) || + test_bit(DN200_PF_NORMAL_OPEN, &priv->state)) + dn200_stop_open_subtask(priv); + else if (test_bit(DN200_ERR_RESET, &priv->state)) + dn200_reset_subtask(priv); + else if (test_bit(DN200_MAC_LINK_DOWN, &priv->state)) + dn200_wq_mac_link_down(priv); + if (test_bit(DN200_VF_FLOW_OPEN, &priv->state)) + dn200_vf_flow_open(priv); + + +} + +/** + * dn200_hw_init - Init the MAC device + * @priv: driver private structure + * Description: this function is to configure the MAC device according to + * some platform parameters or the HW capability register. It prepares the + * driver to use either ring or chain modes and to setup either enhanced or + * normal descriptors. 
+ */ +static int dn200_hw_init(struct dn200_priv *priv) +{ + int ret; + bool is_locked = true; + + /* Initialize HW Interface */ + ret = dn200_hwif_init(priv); + if (ret) + return ret; + + /* Get the HW capability (new GMAC newer than 3.50a) */ + priv->hw_cap_support = dn200_check_hw_features_support(priv); + if (priv->hw_cap_support) { + /* We can override some gmac/dma configuration fields: e.g. + * enh_desc, tx_coe (e.g. that are passed through the + * platform) with the values from the HW capability + * register (if supported). + */ + priv->plat->enh_desc = priv->dma_cap.enh_desc; + if (priv->dma_cap.hash_tb_sz) { + priv->hw->multicast_filter_bins = + (BIT(priv->dma_cap.hash_tb_sz) << 5); + priv->hw->mcast_bits_log2 = + ilog2(priv->hw->multicast_filter_bins); + } + + /* TXCOE doesn't work in thresh DMA mode */ + priv->plat->tx_coe = priv->dma_cap.tx_coe; + + /* In case of GMAC4 rx_coe is from HW cap register. */ + priv->plat->rx_coe = priv->dma_cap.rx_coe; + + if (priv->dma_cap.rx_coe_type2) + priv->plat->rx_coe = DN200_RX_COE_TYPE2; + else if (priv->dma_cap.rx_coe_type1) + priv->plat->rx_coe = DN200_RX_COE_TYPE1; + } else { + dev_info(priv->device, + "No HW DMA feature register supported\n"); + } + + if (priv->plat->rx_coe) { + priv->hw->rx_csum = priv->plat->rx_coe; + if (netif_msg_probe(priv)) + dev_info(priv->device, + "RX Checksum Offload Engine supported\n"); + if (priv->chip_id < DWMAC_CORE_4_00) + if (netif_msg_probe(priv)) + dev_info(priv->device, "COE Type %d\n", + priv->hw->rx_csum); + } + if (priv->plat->tx_coe) + if (netif_msg_probe(priv)) + dev_info(priv->device, + "TX Checksum insertion supported\n"); + + priv->hw->vlan_fail_q_en = priv->plat->vlan_fail_q_en; + priv->hw->vlan_fail_q = priv->plat->vlan_fail_q; + + /* Run HW quirks, if any */ + if (priv->hwif_quirks) { + ret = priv->hwif_quirks(priv); + if (ret) + return ret; + } + + /* Rx Watchdog is available in the COREs newer than the 3.40. 
+ * In some case, for example on bugged HW this feature + * has to be disable and this can be done by passing the + * riwt_off field from the platform. + */ + if ((priv->chip_id >= DWMAC_CORE_3_50 || priv->plat->has_xgmac) && + !priv->plat->riwt_off) { + priv->use_riwt = 1; + } + + if (PRIV_SRIOV_SUPPORT(priv) && !PRIV_IS_VF(priv)) { + dn200_sriov_mail_init(priv); + /* clear fw lock state and lram lock info */ + dn200_hw_unlock(priv->hw, &is_locked); + } + return 0; +} + +static void dn200_napi_add(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 queue, maxq; + u8 rx_num = priv->plat_ex->default_rx_queue_num; + u8 tx_num = priv->plat_ex->default_tx_queue_num; + + if (PRIV_IS_PUREPF(priv)) { + rx_num = priv->plat->rx_queues_to_use; + tx_num = priv->plat->tx_queues_to_use; + } + maxq = max(rx_num, tx_num); + + for (queue = 0; queue < maxq; queue++) { + struct dn200_channel *ch = &priv->channel[queue]; + + ch->priv_data = priv; + ch->index = queue; + ch->in_sch = false; + spin_lock_init(&ch->lock); + + if (queue < rx_num && queue < tx_num && + priv->txrx_itr_combined) { + netif_napi_add(dev, &ch->agg_napi, dn200_napi_poll_agg); + } else { + if (queue < rx_num) { + netif_napi_add(dev, &ch->rx_napi, + dn200_napi_poll_rx); + } + if (queue < tx_num) { + netif_napi_add(dev, &ch->tx_napi, + dn200_napi_poll_tx); + } + } + } +} + +static void dn200_napi_del(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + u32 queue, maxq; + u8 rx_num = priv->plat_ex->default_rx_queue_num; + u8 tx_num = priv->plat_ex->default_tx_queue_num; + + if (PRIV_IS_PUREPF(priv)) { + rx_num = priv->plat->rx_queues_to_use; + tx_num = priv->plat->tx_queues_to_use; + } + maxq = max(rx_num, tx_num); + + for (queue = 0; queue < maxq; queue++) { + struct dn200_channel *ch = &priv->channel[queue]; + + if (queue < rx_num && + queue < tx_num && + priv->txrx_itr_combined) { + netif_napi_del(&ch->agg_napi); + } else { + if (queue < rx_num) + 
netif_napi_del(&ch->rx_napi); + if (queue < tx_num) + netif_napi_del(&ch->tx_napi); + } + } +} + +/** + * dn200_disable_all_queues - Disable all queues + * @priv: driver private structure + */ +static void dn200_disable_all_queues(struct dn200_priv *priv) +{ + /* at first disable napi and then delete it */ + __dn200_disable_all_queues(priv); +} + +/** + * dn200_enable_all_queues - Enable all queues + * @priv: driver private structure + */ +void dn200_enable_all_queues(struct dn200_priv *priv) +{ + u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); + u32 queue; + + /* add napi to netdev poll list and then enable it */ + for (queue = 0; queue < maxq; queue++) { + struct dn200_channel *ch = &priv->channel[queue]; + + if (queue < rx_queues_cnt && queue < tx_queues_cnt && + priv->txrx_itr_combined) { + napi_enable(&ch->agg_napi); + } else { + if (queue < rx_queues_cnt) + napi_enable(&ch->rx_napi); + if (queue < tx_queues_cnt) + napi_enable(&ch->tx_napi); + } + } +} + +static bool dn200_tx_queue_clean(struct dn200_priv *priv, u8 queue_index) +{ + unsigned long start_time, end_time; + u32 value; + bool is_in_task = !!test_bit(DN200_IN_TASK, &priv->state); + + queue_index += DN200_TXQ_START_GET(priv->hw); + /* max wait 500ms to flush tx queue */ +#define MAX_FLUSH_TIME_NS (500 * 1000 * 1000) + start_time = ktime_get_ns(); + while (true) { + value = readl(priv->ioaddr + XGMAC_CH_DESC_CACHE_LVL(queue_index)) & XGMAC_TXLVL; + if (value == 0x0) + break; + if (is_in_task) + usleep_range(100, 200); + else + udelay(10); + end_time = ktime_get_ns(); + if ((end_time - start_time) > MAX_FLUSH_TIME_NS) { + netif_info(priv, ifdown, priv->dev, + "TXQ:%d exceed max time:%u cache value = 0x%x\n", + queue_index, MAX_FLUSH_TIME_NS, value); + return false; + } + } + + while (true) { + value = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(queue_index)) & 0xffff; + if (value == 0x0 || value == 0x100) + 
break; + if (is_in_task) + usleep_range(100, 200); + else + udelay(10); + end_time = ktime_get_ns(); + if ((end_time - start_time) > MAX_FLUSH_TIME_NS) { + netif_info(priv, ifdown, priv->dev, + "TXQ:%d exceed max time:%u dbg_status = 0x%x\n", + queue_index, MAX_FLUSH_TIME_NS, value); + return false; + } + } + + return true; +} + +int dn200_clean_all_tx_queues(struct dn200_priv *priv, u8 tx_queue_num) +{ + int i = 0; + bool clean_succ = false; + u8 flow_state; + + netif_tx_disable(priv->dev); + netif_carrier_off(priv->dev); + if (PRIV_IS_VF(priv)) { + DN200_ITR_SYNC_GET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), &flow_state); + if (flow_state == FLOW_CLOSE_START) + return 0; + if (test_bit(DN200_VF_FLOW_CLOSE, &priv->state)) + return 0; + } + /* vf do not check tx queue when pf notify vf to sw reset*/ + for (; i < tx_queue_num; i++) { + clean_succ = dn200_tx_queue_clean(priv, i); + if (!clean_succ) + return -EBUSY; + } + return 0; +} + +static bool dn200_rx_queue_clean(struct dn200_priv *priv, u8 queue_index) +{ + u32 ch_dbg_st = 0; + u32 rxdma_fsm_st = 0; + u32 mtl_rxq_dbg = 0; + u32 mac_dbg = 0, mac_rx_dbg = 0; + unsigned long start_time, end_time; + +#define RX_MAX_FLUSH_TIME_NS (10 * 1000 * 1000) /* 10ms */ + start_time = ktime_get_ns(); + while (true) { + ch_dbg_st = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(queue_index)); + rxdma_fsm_st = ch_dbg_st & XGMAC_RXDMA_FSM_STATE_MASK; + + mtl_rxq_dbg = readl(priv->ioaddr + XGMAC_MTL_RXQ_DEBUG(queue_index)); + mac_dbg = readl(priv->ioaddr + XGMAC_MAC_DEBUG); + mac_rx_dbg = mac_dbg & XGMAC_MAC_RX_FIFO_ACT; + + if ((rxdma_fsm_st == XGMAC_RXDMA_FSM_STATE || rxdma_fsm_st == 0) + && mtl_rxq_dbg == 0 + && mac_rx_dbg == 0) + break; + + end_time = ktime_get_ns(); + if ((end_time - start_time) > RX_MAX_FLUSH_TIME_NS) { + netdev_info(priv->dev, + "RXQ:%d exceed max flush time:%u, ch_dbg_st:%#x, mtl_rxq_dbg:%#x, mac_rx_dbg:%#x\n", + queue_index, RX_MAX_FLUSH_TIME_NS, ch_dbg_st, mtl_rxq_dbg, mac_rx_dbg); + 
return false; + } + } + return true; +} + +int dn200_clean_all_rx_queues(struct dn200_priv *priv) +{ + int queue = 0; + bool ret = true; + + for (queue = 0; queue <= DN200_LAST_QUEUE(priv); queue++) { + ret = dn200_rx_queue_clean(priv, queue); + if (!ret) + return -EBUSY; + } + return 0; +} + +static int dn200_sw_resc_reinit(struct dn200_priv *priv, bool rxp_clean); +static int dn200_sw_resc_close(struct dn200_priv *priv); +int dn200_reinit_hwts(struct net_device *dev, bool initial, u32 new_flags) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret = 0; + bool need_update; + bool need_hw_reset = false; + int rx_state = 0; + + rx_state = dn200_mac_rx_get(priv, priv->ioaddr); + need_update = netif_running(dev); + if (test_and_set_bit(DN200_NET_SUSPENDED, &priv->state)) + return -EINVAL; + + netif_trans_update(priv->dev); + if (need_update) { + ret = dn200_sw_resc_close(priv); + if (ret < 0) { + need_hw_reset = true; + dev_close(dev); + } + } + + priv->eth_priv_flags = new_flags; + if (initial) { + priv->hwts_rx_en = 1; + priv->hwts_tx_en = 1; + } else { + priv->hwts_rx_en = 0; + priv->hwts_tx_en = 0; + } + if (need_update) { + if (need_hw_reset) + ret = dev_open(dev, NULL); + else + ret = dn200_sw_resc_reinit(priv, false); + } + /* enable mac rx engine state to deal with vf down but pf up */ + if (rx_state) + dn200_mac_rx_set(priv, priv->ioaddr, true); + clear_bit(DN200_NET_SUSPENDED, &priv->state); + + return ret; +} + +int dn200_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret = 0; + int i = 0; + bool need_update; + + need_update = netif_running(dev); + if (test_and_set_bit(DN200_NET_SUSPENDED, &priv->state)) + return -EINVAL; + + netif_trans_update(priv->dev); + if (need_update) + dev_close(dev); + + priv->plat->rx_queues_to_use = rx_cnt; + priv->plat->tx_queues_to_use = tx_cnt; + if (!PRIV_IS_VF(priv)) { + for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++) + priv->rss.table[i] = + 
ethtool_rxfh_indir_default(i, rx_cnt); + } + dn200_rss_configure(priv, priv->hw, &priv->rss, + rx_cnt); + + if (priv->dma_cap.rmon) + memset(&priv->mmc, 0, sizeof(struct dn200_counters)); + + memset(&priv->xstats, 0, sizeof(struct dn200_extra_stats)); + + if (need_update) + ret = dev_open(dev, NULL); + + clear_bit(DN200_NET_SUSPENDED, &priv->state); + return ret; +} + +static bool dn200_all_rx_queue_fc_act_check(struct dn200_priv *priv) +{ + u8 count = priv->plat_ex->rx_queues_total; + int i = 0; + u32 debug_val = 0; + u32 fc_status = 0; + + for (; i < count; i++) { + debug_val = readl(priv->ioaddr + XGMAC_MTL_RXQ_DEBUG(i)); + fc_status = (debug_val & XGMAC_RXQSTS) >> XGMAC_RXQSTS_SHIFT; + if (fc_status == XGMAC_FC_OVER_TH || fc_status == XGMAC_FC_QUEUE_FULL) + return true; + } + return false; +} + +int dn200_vf_flow_state_process(struct dn200_priv *priv) +{ + int ret = 0; + u8 flow_state = 0; + struct net_device *dev = priv->dev; + + if (!PRIV_IS_VF(priv)) + return ret; + + set_bit(DN200_IN_TASK, &priv->state); + DN200_ITR_SYNC_GET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), &flow_state); + + if (test_bit(DN200_VF_FLOW_CLOSE, &priv->state)) { + /*stop flow and free tx/rx related sw resource */ + if (netif_running(dev)) { + dn200_sw_resc_close(priv); + dev_dbg(priv->device, + "%s, %d, vf funcid:%#x, vf offset:%d have been close cpu_id = %d\n", + __func__, __LINE__, priv->plat_ex->funcid, + DN200_VF_OFFSET_GET(priv->hw), smp_processor_id()); + } + clear_bit(DN200_VF_FLOW_CLOSE, &priv->state); + DN200_VF_UPGRADE_SET(priv, priv->plat_ex->vf_offset, BIT(0)); + } + + if (test_bit(DN200_VF_FLOW_OPEN_SET, &priv->state)) { + if (netif_running(dev)) { + dn200_sw_resc_reinit(priv, true); + dev_dbg(priv->device, + "%s, %d, vf funcid:%#x, vf offset:%d, have been open.\n", + __func__, __LINE__, priv->plat_ex->funcid, + DN200_VF_OFFSET_GET(priv->hw)); + } + clear_bit(DN200_VF_FLOW_OPEN_SET, &priv->state); + DN200_VF_UPGRADE_SET(priv, priv->plat_ex->vf_offset, 
BIT(0)); + } + clear_bit(DN200_IN_TASK, &priv->state); + return ret; +} + +static int dn200_sw_resc_close(struct dn200_priv *priv) +{ + int ret = 0; + u32 tx_count = priv->plat->tx_queues_to_use; + u32 i; + u32 chan = 0; + + if (!netif_running(priv->dev)) + return 0; + + if (test_bit(DN200_DOWN, &priv->state)) + return 0; + /* set vf in stop to forbid get hw lock to set rxp */ + if (PRIV_IS_VF(priv)) + set_bit(DN200_VF_IN_STOP, &priv->state); + netif_tx_disable(priv->dev); + netif_carrier_off(priv->dev); + for (i = 0; i < tx_count; i++) { + if (priv->tx_queue[i].txtimer.function) + hrtimer_cancel(&priv->tx_queue[i].txtimer); + if (priv->tx_queue[i].tx_task.func) + cancel_work_sync(&priv->tx_queue[i].tx_task); + if (priv->tx_queue[chan].poll_txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].poll_txtimer); + memset(&priv->tx_queue[chan].poll_txtimer, 0, + sizeof(struct hrtimer)); + if (priv->tx_queue[chan].poll_tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].poll_tx_task); + } + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) { + if (priv->rx_queue[chan].poll_rxtimer.function) + hrtimer_cancel(&priv->rx_queue[chan].poll_rxtimer); + memset(&priv->rx_queue[chan].poll_rxtimer, 0, + sizeof(struct hrtimer)); + if (priv->rx_queue[chan].poll_rx_task.func) + cancel_work_sync(&priv->rx_queue[chan].poll_rx_task); + } + if (!PRIV_IS_VF(priv)) + dn200_flow_ctrl(priv, priv->hw, false, FLOW_OFF, + 0, priv->plat->tx_queues_to_use); + ret = dn200_datapath_close(priv); + + if (ret < 0) + return ret; + if (test_and_set_bit(DN200_DOWN, &priv->state)) + return 0; + priv->vf_sw_close_flag = true; + dn200_stop_all_dma(priv); + + dn200_disable_all_queues(priv); + udelay(10); + dn200_free_irq(priv->dev, REQ_IRQ_ERR_ALL, 0); + if (PRIV_IS_PUREPF(priv)) + dn200_napi_del(priv->dev); + /*reclaim tx resource*/ + for (i = 0; i < tx_count; i++) { + if (priv->tx_queue[i].cur_tx != priv->tx_queue[i].dirty_tx) + dn200_sw_tx_clean(priv, 64, i); + } + 
free_dma_desc_resources(priv); + + return 0; +} + +static int dn200_sw_resc_reinit(struct dn200_priv *priv, bool rxp_clean) +{ + struct dn200_phy_info *phy_info = priv->plat_ex->phy_info; + int ret = 0; + int chan = 0; + + if (!netif_running(priv->dev)) + return 0; + if (!test_bit(DN200_DOWN, &priv->state)) + return 0; + priv->vf_sw_close_flag = false; + ret = dn200_iatu_init(priv); + if (ret) + return ret; + ret = alloc_dma_desc_resources(priv); + if (ret) { + netdev_err(priv->dev, "%s: DMA descriptors allocation failed\n", + __func__); + goto dma_desc_error; + } + ret = init_dma_desc_rings(priv->dev, GFP_KERNEL); + if (ret) { + netdev_err(priv->dev, + "%s: DMA descriptors initialization failed\n", + __func__); + goto init_error; + } + dn200_add_rx_iatu2tx(priv); + if (PRIV_IS_VF(priv)) + clear_bit(DN200_VF_IN_STOP, &priv->state); + ret = dn200_hw_setup(priv->dev, false); + if (ret) { + netdev_err(priv->dev, "%s: Hw setup failed\n", __func__); + goto init_error; + } + dn200_init_coalesce(priv); + ret = dn200_request_irq(priv->dev); + if (ret) + goto irq_error; + if (PRIV_IS_PUREPF(priv)) + dn200_napi_add(priv->dev); + dn200_enable_all_queues(priv); + dn200_enable_all_dma_irq(priv); + if (rxp_clean) + dn200_eth_reconfig(priv); + netif_tx_start_all_queues(priv->dev); + if (PRIV_IS_VF(priv)) + dn200_start_all_dma(priv); + if (priv->plat_ex->phy_info->link_status) { + if (!PRIV_IS_VF(priv)) + dn200_wq_mac_link_down(priv); + phy_info->mac_ops->mac_link_up(priv, phy_info->phydev, 0, + phy_info->phy_interface, + phy_info->speed, phy_info->dup, + 0, 0); + netif_carrier_on(priv->dev); + linkwatch_fire_event(priv->dev); + } + clear_bit(DN200_DOWN, &priv->state); + return 0; +irq_error: + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) { + if (priv->tx_queue[chan].tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].tx_task); + if (priv->tx_queue[chan].txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].txtimer); + memset(&priv->tx_queue[chan].txtimer, 0, 
+ sizeof(struct hrtimer)); + if (priv->tx_queue[chan].poll_tx_task.func) + cancel_work_sync(&priv->tx_queue[chan].poll_tx_task); + if (priv->tx_queue[chan].poll_txtimer.function) + hrtimer_cancel(&priv->tx_queue[chan].poll_txtimer); + memset(&priv->tx_queue[chan].poll_txtimer, 0, + sizeof(struct hrtimer)); + } + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) { + if (priv->rx_queue[chan].poll_rx_task.func) + cancel_work_sync(&priv->rx_queue[chan].poll_rx_task); + if (priv->rx_queue[chan].poll_rxtimer.function) + hrtimer_cancel(&priv->rx_queue[chan].poll_rxtimer); + memset(&priv->rx_queue[chan].poll_rxtimer, 0, + sizeof(struct hrtimer)); + } + dn200_hw_teardown(priv->dev); +init_error: + free_dma_desc_resources(priv); +dma_desc_error: + return ret; +} + +static void dn200_retask(struct work_struct *work) +{ + struct dn200_priv *priv; + + priv = container_of(work, struct dn200_priv, retask); + ctrl_reset(&priv->plat_ex->ctrl, true); + + if (priv->mii) + dn200_xgmac_clock_ctl(priv); + dn200_resume(priv->device); +} + +static void dn200_reconfig_task(struct work_struct *work) +{ + struct dn200_priv *priv; + + priv = container_of(work, struct dn200_priv, reconfig_task); + dn200_eth_reconfig(priv); +} + +static void dn200_vf_process_task(struct work_struct *work) +{ + struct dn200_priv *priv = container_of(work, struct dn200_priv, + vf_process_task); + + if (test_bit(DN200_VF_FLOW_STATE_SET, &priv->state)) { + dn200_vf_flow_state_process(priv); + clear_bit(DN200_VF_FLOW_STATE_SET, &priv->state); + } + + if (test_bit(DN200_VF_NOTIFY_PF_RESET, &priv->state)) { + dn200_vf_glb_err_rst_notify(priv); + clear_bit(DN200_VF_NOTIFY_PF_RESET, &priv->state); + } +} + +static void dn200_tx_reset(struct timer_list *t) +{ + struct dn200_priv *priv = from_timer(priv, t, reset_timer); + + /* tx abnormal should call global err processing to reset hw */ + dn200_global_err(priv, DN200_TX_RESET); +} + +static void dn200_reset_vf_rxp(struct dn200_priv *priv, u8 vf_offset) +{ + 
	priv->pf_rxp_set |= RXP_CLEAR_VF_RXP;
	priv->clear_vf_rxp_bitmap |= (1 << vf_offset);
	if (PRIV_SRIOV_SUPPORT(priv) && !test_bit(DN200_RXP_SETTING, &priv->state))
		queue_work(priv->wq, &priv->rxp_task);
	/* mark the VF unregistered and reset its lost-heartbeat counter */
	DN200_HEARTBEAT_SET(priv->hw, registered_vf_state, vf_offset,
			    DN200_VF_REG_STATE_NONE);
	priv->plat_ex->vf_loss_hb_cnt[vf_offset] = 0;
}

/* PF-side heartbeat check: for every registered+opened VF, compare the
 * VF-written heartbeat against the last value seen. An unchanged value
 * increments a loss counter; past DN200_MAX_VF_LOSS_HB_CNT the VF's rxp
 * entries are torn down and its DMA stopped.
 */
static void dn200_check_vf_alive(struct dn200_priv *priv)
{
	u32 vf_num;
	u8 last_beat, reg_info, beat;

	for (vf_num = 0; vf_num < priv->plat_ex->pf.registered_vfs; vf_num++) {
		DN200_HEARTBEAT_GET(priv->hw, registered_vf_state, vf_num,
				    &reg_info);
		if (!(reg_info & DN200_VF_REG_STATE_OPENED)) {
			priv->plat_ex->vf_loss_hb_cnt[vf_num] = 0;
			/*vf not register now, skip it */
			continue;
		}
		DN200_HEARTBEAT_GET(priv->hw, heartbeat, vf_num, &beat);
		DN200_HEARTBEAT_GET(priv->hw, last_heartbeat, vf_num,
				    &last_beat);
		if (beat == last_beat) {
			/* heartbeat did not advance since last check */
			(priv->plat_ex->vf_loss_hb_cnt[vf_num])++;
			if (priv->plat_ex->vf_loss_hb_cnt[vf_num] >
			    DN200_MAX_VF_LOSS_HB_CNT) {
				netdev_warn(priv->dev,
					    "%s vf %d register but not keep heartbeat beat %d last_beat %d\n",
					    __func__, vf_num, beat, last_beat);
				dn200_reset_vf_rxp(priv, vf_num);
				/* hw function id = vf index + 1 (0 is the PF) */
				dn200_stop_vf_dma(priv, vf_num + 1);
				if (dn200_all_rx_queue_fc_act_check(priv)) {
					/* rx flow control stuck: escalate */
					dn200_global_err(priv, DN200_FC_VF_STOP);
					return;
				}
			}
			continue;
		}
		/*VF has returned to normal, clear loss cnt */
		priv->plat_ex->vf_loss_hb_cnt[vf_num] = 0;
		last_beat = beat;
		DN200_HEARTBEAT_SET(priv->hw, last_heartbeat, vf_num,
				    last_beat);
	}
}

/* Re-arm the tx-clean timer for any tx queue that still has un-reclaimed
 * descriptors and a pending (but unscheduled) clean task/timer.
 */
static void dn200_vf_tx_clean_ck(struct dn200_priv *priv)
{
	u32 tx_count = priv->plat->tx_queues_to_use;
	u32 queue;

	if (test_bit(DN200_DOWN, &priv->state) || !netif_running(priv->dev))
		return;
	for (queue = 0; queue < tx_count; queue++) {
		struct dn200_tx_queue *tx_q = &priv->tx_queue[queue];

		if (tx_q->cur_tx != tx_q->dirty_tx && (tx_q->task_need_sch || tx_q->txtimer_need_sch)) {
			tx_q->old_dirty_tx = tx_q->dirty_tx;
			/* NOTE(review): this comparison is a tautology --
			 * old_dirty_tx was assigned from dirty_tx on the
			 * previous line, so the branch is always taken.
			 * Possibly the assignment was meant to happen on a
			 * previous invocation (progress check between two
			 * ticks) -- confirm intent with the author.
			 */
			if (tx_q->old_dirty_tx == tx_q->dirty_tx) {
				netdev_dbg(priv->dev,
					   "%s queue %d tx_clean need resch:cur_tx %d dirty %d timer %d\n",
					   __func__, queue, tx_q->cur_tx, tx_q->dirty_tx,
					   atomic_read(&tx_q->txtimer_running));
				netdev_dbg(priv->dev, "task_status %d time_status %d\n",
					   tx_q->task_need_sch, tx_q->txtimer_need_sch);
				dn200_tx_timer_arm(priv, queue);
			}
		}
	}
}

/* VF-side heartbeat: react to PF/FW error states published in LRAM and
 * toggle this VF's heartbeat byte so the PF's dn200_check_vf_alive()
 * sees progress.
 */
static void dn200_vf_heartbeat(struct dn200_priv *priv)
{
	u8 last_beat, reg_info, beat, states;

	DN200_GET_LRAM_MAILBOX_MEMBER(priv->hw, pf_fw_err_states, &states);
	if (states) {
		/* PF firmware reported an error: close the device once */
		if (!test_bit(DN200_DEV_ERR_CLOSE, &priv->state))
			dn200_fw_err_dev_close(priv);
		return;
	}

	DN200_HEARTBEAT_GET(priv->hw, registered_vf_state,
			    priv->plat_ex->vf_offset, &reg_info);
	DN200_GET_LRAM_MAILBOX_MEMBER(priv->hw, pf_states, &states);
	if (reg_info == 0 && states != 0) {
		/*vf self has been clear rxp info by PF, reset vf self */
		dn200_normal_reset(priv);
		return;
	}

	DN200_HEARTBEAT_GET(priv->hw, last_heartbeat, priv->plat_ex->vf_offset,
			    &last_beat);

	/* heartbeat is a simple 0/1 toggle of the last value the PF saw */
	beat = !last_beat;
	netdev_dbg(priv->dev, "%s vf %d last_beat %d set beat to %d\n",
		   __func__, priv->plat_ex->vf_offset, last_beat, beat);

	DN200_HEARTBEAT_SET(priv->hw, heartbeat, priv->plat_ex->vf_offset,
			    beat);
}

/* Map a firmware-commit (nic_st, status) pair to a logged error message.
 * Returns -EIO when the pair matches a known failure, 0 otherwise.
 */
static int dn200_parse_fw_commit_err(struct device *dev, int nic_st, int status)
{
	int i;
	struct fw_cmt_err {
		int nic_st;
		int status;
		char *info;
	} err_info[] = {
		{1, 1, "the NIC fw upgrade fail"},
		{1, 2, "the NIC fw uncompress to mem fail"},
		// {2, 0, "the NIC fw prepare jump"},
		{3, 1, "the NIC vu or cli cmd is running"},
		{3, 2, "the NIC cli cmd unlock fail"},
		{3, 3, "the NIC wait lock timeout"},
		/* NOTE(review): duplicate key {3, 2} -- this entry is
		 * unreachable because the "cli cmd unlock fail" entry above
		 * matches first. Likely a typo for a distinct status code
		 * (e.g. {3, 4}); confirm against the firmware spec.
		 */
		{3, 2, "the NIC jump to new fw fail"},
	};

	for (i = 0; i < ARRAY_SIZE(err_info); i++) {
		if (err_info[i].nic_st == nic_st && err_info[i].status == status) {
			dev_err(dev, "[loading fw]%s [nic_st %d, status %d].\n",
				err_info[i].info, nic_st, status);
			return -EIO;
		}
	}
return 0; +} + +static void dn200_upgrade_timer(struct timer_list *t) +{ + struct dn200_priv *priv = from_timer(priv, t, upgrade_timer); + u32 flags = 0; + u32 status = 0; + int i = 0; + + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return; + } + + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, nic_st, &flags); + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, rsv, &status); + + if (dn200_parse_fw_commit_err(priv->device, flags, status)) { + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + priv->update_fail = true; + } + if (flags == DN200_STOP_FLAG) { + DN200_SET_LRAM_UPGRADE_PF(priv->hw, 1, priv->plat_ex->pf_id); + if (!priv->upgrade_time && priv->flag_upgrade) + netdev_info(priv->dev, "%s: start fw upgrade......\n", __func__); + if (!priv->upgrade_time && !priv->flag_upgrade) + netdev_info(priv->dev, "other's port now in upgrade, stop do anything,wait a while......\n"); + if (!test_bit(DN200_DOWN, &priv->state) && PRIV_PHY_INFO(priv)->link_status) { + for (i = 0; i < priv->plat->rx_queues_to_use; i++) + dn200_rx_timer_poll(priv, i); + for (i = 0; i < priv->plat->tx_queues_to_use; i++) + dn200_tx_timer_poll(priv, i); + } + priv->upgrade_time++; + if (priv->upgrade_time > 2000) { /*give 60s, it means sufficient*/ + priv->upgrade_time = 0; + DN200_SET_LRAM_UPGRADE_PF(priv->hw, 1, priv->plat_ex->pf_id); + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + return; + } + mod_timer(&priv->upgrade_timer, + jiffies + msecs_to_jiffies(30)); + return; + } + + if (flags == DN200_START_FLAG) { + priv->upgrade_time = 0; + if (!priv->flag_upgrade) + netdev_info(priv->dev, "other's port upgrade success,you can do anything now......\n"); + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + 
DN200_SET_LRAM_UPGRADE_PF_FINISH(priv->hw, 1, priv->plat_ex->pf_id); + return; + } + + if (flags == DN200_UNFINISH_FLAG) { + priv->upgrade_time = 0; + if (priv->flag_upgrade) + netdev_info(priv->dev, "[loading fw]dn200 load fw fail, for img copy to flash happened err.\n"); + if (!priv->flag_upgrade) + netdev_info(priv->dev, "[loading fw]dn200 load fw fail, you can start other thing\n"); + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + priv->update_fail = true; + return; + } + + if (flags == DN200_JMP_FAIL_FLAG) { + priv->upgrade_time = 0; + if (priv->flag_upgrade) + netdev_info(priv->dev, "[loading fw]dn200 load fw fail, for jmp new img fail.\n"); + if (!priv->flag_upgrade) + netdev_info(priv->dev, "[loading fw]dn200 load fw fail, you can start other thing\n"); + clear_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state); + priv->update_fail = true; + return; + } + if (test_bit(ADMIN_UP_GRADE_FLAG, &priv->plat_ex->ctrl.admin_state)) + mod_timer(&priv->upgrade_timer, + jiffies + msecs_to_jiffies(100)); + +} + +static void dn200_heartbeat(struct timer_list *t) +{ + struct dn200_priv *priv = from_timer(priv, t, keepalive_timer); + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return; + if (priv->dev && priv->dev->reg_state == NETREG_RELEASED) + return; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return; + } + /* tx abnormal should call global err processing to reset hw */ + if (!netif_running(priv->dev)) { + mod_timer(&priv->keepalive_timer, + jiffies + msecs_to_jiffies(1000)); + return; + } + + if (PRIV_SRIOV_SUPPORT(priv)) { + dn200_check_vf_alive(priv); + if (!test_bit(DN200_RXP_SETTING, &priv->state) && 
test_bit(DN200_RXP_NEED_CHECK, &priv->state)) { + queue_work(priv->wq, &priv->rxp_task); + clear_bit(DN200_RXP_NEED_CHECK, &priv->state); + } + } else if (PRIV_IS_VF(priv)) { + dn200_vf_heartbeat(priv); + dn200_vf_tx_clean_ck(priv); + } + + mod_timer(&priv->keepalive_timer, jiffies + msecs_to_jiffies(1000)); +} + +static u32 dn200_nextpow2(u32 value) +{ + value--; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + value++; + return value; +} + +int dn200_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size) +{ + struct dn200_priv *priv = netdev_priv(dev); + int ret = 0; + int rx_chan = 0; + bool need_update; + bool need_hw_reset = false; + int rx_state = 0; + + rx_state = dn200_mac_rx_get(priv, priv->ioaddr); + need_update = netif_running(dev); + if (test_and_set_bit(DN200_NET_SUSPENDED, &priv->state)) + return -EINVAL; + + netif_trans_update(priv->dev); + if (need_update) { + if (!HW_IS_PUREPF(priv->hw)) { + ret = dn200_sw_resc_close(priv); + if (ret < 0) { + need_hw_reset = true; + dev_close(dev); + } + } else { + need_hw_reset = true; + dev_close(dev); + } + } + /*for rx_size and tx_size is impossileby 0 */ + priv->dma_rx_size = + dn200_nextpow2(rx_size) > DMA_MAX_RX_SIZE ? DMA_MAX_RX_SIZE : dn200_nextpow2(rx_size); + priv->dma_tx_size = + dn200_nextpow2(tx_size) > DMA_MAX_TX_SIZE ? DMA_MAX_TX_SIZE : dn200_nextpow2(tx_size); + if (priv->dma_rx_size < DMA_MIN_RX_SIZE || + priv->dma_rx_size > DMA_MAX_RX_SIZE || + priv->dma_tx_size < DMA_MIN_TX_SIZE || + priv->dma_tx_size > DMA_MAX_TX_SIZE) + WARN_ON(1); + + /* The watchdog is enabled after packet collection is complete. + * However, if the descriptor queue is small and the packet collection speed is fast, + * descriptors may be used up before the watchdog interrupt takes effect, + * and packet collection is blocked if no interruption is triggered. 
+ * Therefore, it is necessary to use the packet receiving interrupt aggregation + * to avoid this situation, + * and ensure that rx-frams + 32 is smaller than rx_dma_size + */ + for (; rx_chan < priv->plat->rx_queues_to_use; rx_chan++) { + if ((priv->rx_coal_frames[rx_chan] + + dn200_rx_refill_size(priv)) >= priv->dma_rx_size) { + netdev_warn(dev, "change queue %d rx-frames to %d\n", + rx_chan, DN200_RX_MIN_FRAMES); + priv->rx_coal_frames[rx_chan] = DN200_RX_MIN_FRAMES; + } + } + if (need_update) { + if (need_hw_reset) + ret = dev_open(dev, NULL); + else + ret = dn200_sw_resc_reinit(priv, false); + } + if (rx_state) + dn200_mac_rx_set(priv, priv->ioaddr, true); + clear_bit(DN200_NET_SUSPENDED, &priv->state); + /* base on new desc ring size to update rx interrupt usec */ + dn200_rx_itr_usec_update(priv); + return ret; +} + +static int dn200_hw_phy_info_init(struct dn200_priv *priv, + struct net_device *ndev) +{ + int ret = 0; + + dn200_check_pcs_mode(priv); + + if (priv->hw->pcs != DN200_PCS_TBI && + priv->hw->pcs != DN200_PCS_RTBI && !priv->plat_ex->has_xpcs) { + /* MDIO bus Registration */ + ret = dn200_mdio_register(ndev); + if (ret < 0) { + dev_err(priv->device, + "%s: MDIO bus (id: %d) registration failed", + __func__, priv->plat->bus_id); + return ret; + } + } + ret = dn200_phy_info_init(priv->dev, &dn200_phy_mac_ops); + if (ret) { + netdev_err(ndev, "failed to setup phy (%d)\n", ret); + goto error_phy_setup; + } + return 0; + +error_phy_setup: + if (priv->hw->pcs != DN200_PCS_TBI && priv->hw->pcs != DN200_PCS_RTBI) + dn200_mdio_unregister(ndev); + return ret; +} + +static int dn200_sw_phy_info_init(struct dn200_priv *priv, + struct net_device *ndev) +{ + dn200_phy_info_init(priv->dev, &dn200_vf_phy_mac_ops); + return 0; +} + +static int dn200_phyinfo_init(struct dn200_priv *priv, struct net_device *ndev) +{ + if (PRIV_IS_VF(priv)) + return dn200_sw_phy_info_init(priv, ndev); + else + return dn200_hw_phy_info_init(priv, ndev); +} + +static int 
dn200_ntuple_fdir_init(struct dn200_priv *priv) +{ + struct dma_features *dma_cap = &priv->dma_cap; + + priv->flow_entries_max = dma_cap->l3l4fnum; + priv->fdir_counts = 0; + priv->fdir_enties = devm_kcalloc(priv->device, + dma_cap->l3l4fnum, + sizeof(struct dn200_fdir_filter), + GFP_KERNEL); + if (!priv->fdir_enties) + return -ENOMEM; + memset(&priv->fdir_info, 0, sizeof(priv->fdir_info)); + return 0; +} + +static void dn200_uc_addr_set(struct dn200_priv *priv, int vf_id, u8 *addr) +{ + int i = 0; + + for (i = 0; i < ETH_ALEN; i++) + writeb(addr[i], LRAM_PRIV_MAC_VF_OFFSET(priv, vf_id, i)); +} + +static void dn200_uc_addr_get(struct dn200_priv *priv, int vf_id, u8 *addr) +{ + int i = 0; + + for (i = 0; i < ETH_ALEN; i++) + addr[i] = readb(LRAM_PRIV_MAC_VF_OFFSET(priv, vf_id, i)); +} + +void dn200_vf_mac_change(struct dn200_priv *priv) +{ + u8 mac[ETH_ALEN]; + + dn200_uc_addr_get(priv, priv->plat_ex->vf_offset, mac); + eth_hw_addr_set(priv->dev, mac); + dn200_normal_reset(priv); +} + +void dn200_vf_link_set(struct dn200_priv *priv, u8 link_reset) +{ + if (link_reset & LINK_UP_SET) { /*link_up */ + priv->vf_link_action = LINK_UP_SET; + } else if (link_reset & LINK_DOWN_SET) { /*link_down */ + priv->vf_link_action = LINK_DOWN_SET; + } + queue_work(priv->wq, &priv->vf_linkset_task); +} + +static void dn200_xpcs_all_rst(struct dn200_priv *priv) +{ + u32 reg_val = 0; + u32 i = 0; + /*phy_all rst*/ + for (i = 0; i < priv->plat_ex->total_pfs; i++) { + reg_val = readl(priv->ioaddr + XGE_XGMAC_XPCS_SW_RST(i)); + if (reg_val & BIT(1)) + continue; + writel(reg_val | BIT(1), priv->ioaddr + XGE_XGMAC_XPCS_SW_RST(i)); + } + usleep_range(100, 200); + + /*LD_DN*/ + for (i = 0; i < priv->plat_ex->total_pfs; i++) { + fw_reg_read(&priv->plat_ex->ctrl, + XPCS_REG_BASE + XPCS_REG_OFFSET * i + XPCS_VR_XS_PMA_MP_12G_16G_25G_SRAM, + ®_val); + if (reg_val & BIT(1)) + continue; + reg_val |= BIT(1); + fw_reg_write(&priv->plat_ex->ctrl, + XPCS_REG_BASE + XPCS_REG_OFFSET * i + 
XPCS_VR_XS_PMA_MP_12G_16G_25G_SRAM, + reg_val); + } + usleep_range(100000, 200000); +} + +static ssize_t temp_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dn200_ctrl_resource *ctrl; + struct dn200_priv *priv; + struct net_device *ndev = to_net_dev(dev); + char temp[64] = {}; + int ret; + + priv = netdev_priv(ndev); + ctrl = &priv->plat_ex->ctrl; + ret = dn200_dev_temp_get(ctrl, temp, sizeof(temp)); + if (ret) + return ret; + + return snprintf(buf, sizeof(temp), temp); +} + +void dn200_xgmac_clock_ctl(struct dn200_priv *priv) +{ + u32 value = 0; + + value = readl(priv->ioaddr + XGE_TOP_CONFIG_OFFSET + + 0x1c + priv->plat_ex->pf_id * 0x50); + value &= ~(BIT(2) | BIT(1)); + writel(value, priv->ioaddr + XGE_TOP_CONFIG_OFFSET + + 0x1c + priv->plat_ex->pf_id * 0x50); + usleep_range(10000, 20000); + if (!PRIV_IS_VF(priv) && !priv->mii && !priv->plat_ex->raid_supported) + dn200_xpcs_all_rst(priv); + value = readl(priv->ioaddr + XGE_TOP_CONFIG_OFFSET + + 0x1c + priv->plat_ex->pf_id * 0x50); + value |= (BIT(2) | BIT(1)); + writel(value, priv->ioaddr + XGE_TOP_CONFIG_OFFSET + + 0x1c + priv->plat_ex->pf_id * 0x50); + usleep_range(10000, 20000); +} + +/** + * dn200_dvr_probe + * @device: device pointer + * @plat_dat: platform data pointer + * @plat_ex: db200 platform data pointer + * @res: dn200 resource pointer + * Description: this is the main probe function used to + * call the alloc_etherdev, allocate the priv structure. + * Return: + * returns 0 on success, otherwise errno. 
+ */ +int dn200_dvr_probe(struct device *device, + struct plat_dn200enet_data *plat_dat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res) +{ + struct net_device *ndev = NULL; + struct dn200_priv *priv; + u8 macaddr[ETH_ALEN]; + u32 rxq; + int i, ret = 0; + int min_mtu = 0; + struct dn200_ver dn200_ver_info = {0}; + + ndev = devm_alloc_etherdev_mqs(device, sizeof(struct dn200_priv), + MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES); + if (!ndev) + return -ENOMEM; + + SET_NETDEV_DEV(ndev, device); + + priv = netdev_priv(ndev); + memset(priv, 0, sizeof(*priv)); + priv->device = device; + priv->dev = ndev; + dn200_set_ethtool_ops(ndev); + priv->dma32_iatu_used = false; + priv->pause = PAUSE_TIME; + priv->plat = plat_dat; + priv->plat_ex = plat_ex; + priv->ioaddr = res->addr; + priv->dev->base_addr = (unsigned long)res->addr; + priv->plat->dma_cfg->multi_msi_en = priv->plat->multi_msi_en; + priv->dev->irq = res->irq; + priv->lpi_irq = res->lpi_irq; + priv->sfty_ce_irq = res->sfty_ce_irq; + priv->sfty_ue_irq = res->sfty_ue_irq; + priv->xpcs_irq = res->xpcs_vec; + priv->plat_ex->pf.ioaddr = res->mail; + priv->plat_ex->pf.ctrl_addr = res->ctrl_addr; + priv->plat_ex->priv_back = priv; + ndev->priv_flags |= IFF_UNICAST_FLT; + priv->max_usecs = DN200_ITR_MAX_USECS; + priv->min_usecs = DN200_ITR_MIN_USECS; + priv->txrx_itr_combined = TXRX_ITR_PROCESS_SELF; /*tx and rx irq process self */ + priv->speed = 10000; + priv->speed_cmd = plat_ex->speed_cmd; + priv->dn200_update_ops = &dn200_itr_update_ops; + priv->numa_node = dev_to_node(device); + priv->flag_upgrade = false; + for (i = 0; i < MTL_MAX_RX_QUEUES; i++) + priv->rx_irq[i] = res->rx_irq[i]; + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) + priv->tx_irq[i] = res->tx_irq[i]; + + if (!plat_ex->is_vf) + dn200_get_mac_from_firmware(priv, res); + + if (!is_zero_ether_addr(res->mac)) { + eth_hw_addr_set(priv->dev, res->mac); + ndev->addr_assign_type = NET_ADDR_PERM; + ether_addr_copy(ndev->perm_addr, res->mac); + } + + 
dev_set_drvdata(device, priv->dev); + /* Allocate workqueue */ + priv->wq = alloc_workqueue("%s", WQ_UNBOUND | WQ_MEM_RECLAIM, 1, + "dn200_wq"); + if (!priv->wq) { + dev_err(priv->device, "failed to create workqueue\n"); + return -ENOMEM; + } + priv->tx_wq = + alloc_workqueue("%s_tx_wq", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1, + dev_name(priv->device)); + if (!priv->tx_wq) { + dev_err(priv->device, "failed to create txq workqueue\n"); + return -ENOMEM; + } + + INIT_WORK(&priv->service_task, dn200_service_task); + INIT_WORK(&priv->reconfig_task, dn200_reconfig_task); + INIT_WORK(&priv->vf_process_task, dn200_vf_process_task); + INIT_WORK(&priv->vf_linkset_task, dn200_linkset_task); + INIT_WORK(&priv->rxp_task, dn200_rxp_task); + timer_setup(&priv->reset_timer, dn200_tx_reset, 0); + INIT_WORK(&priv->retask, dn200_retask); + if (!PRIV_IS_VF(priv)) + timer_setup(&priv->upgrade_timer, dn200_upgrade_timer, 0); + /* Init MAC and get the capabilities */ + ret = dn200_hw_init(priv); + if (ret) + goto error_hw_init; + + if (PRIV_SRIOV_SUPPORT(priv)) + dn200_sriov_ver_set(priv, &dn200_ver_info); + /* Only DWMAC core version 5.20 onwards supports HW descriptor prefetch. 
+ */ + if (priv->chip_id < DWMAC_CORE_5_20) + priv->plat->dma_cfg->dche = false; + + dn200_check_ether_addr(priv); + if (PRIV_IS_VF(priv)) { + ether_addr_copy(macaddr, priv->dev->dev_addr); + dn200_uc_addr_set(priv, priv->plat_ex->vf_offset, macaddr); + ndev->addr_assign_type = NET_ADDR_SET; + ndev->netdev_ops = &dn200_vf_netdev_ops; + } else { + ndev->netdev_ops = &dn200_netdev_ops; + } + + if (!PRIV_IS_VF(priv)) + ndev->dcbnl_ops = dn200_get_dcbnl_ops(); + ndev->hw_features = NETIF_F_SG; + if (!HW_IS_PUREPF(priv->hw)) + ndev->features |= NETIF_F_RXCSUM; + else + ndev->hw_features |= NETIF_F_RXCSUM; + ndev->hw_features |= NETIF_F_HW_CSUM; + if (priv->plat->tso_en && priv->dma_cap.tsoen) { + ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; + priv->tso = true; + } + + if (priv->dma_cap.sphen && !priv->plat->sph_disable) { + ndev->hw_features |= NETIF_F_GRO; + priv->sph_cap = true; + priv->sph = priv->sph_cap; + dev_info(priv->device, "SPH feature enabled\n"); + } + + /* The current IP register MAC_HW_Feature1[ADDR64] only define + * 32/40/64 bit width, but some SOC support others like i.MX8MP + * support 34 bits but it map to 40 bits width in MAC_HW_Feature1[ADDR64]. + * So overwrite dma_cap.addr64 according to HW real design. + */ + if (priv->plat->addr64) + priv->dma_cap.addr64 = priv->plat->addr64; + + if (priv->plat->addr64 > 32) { + /* If more than 32 bits can be addressed, make sure to + * enable enhanced addressing mode. 
+ */ + if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)) + priv->plat->dma_cfg->eame = true; + } else { + priv->dma_cap.addr64 = 32; + } + if (PRIV_IS_VF(priv)) + ndev->watchdog_timeo = msecs_to_jiffies(120 * 1000); + else + ndev->watchdog_timeo = msecs_to_jiffies(TX_TIMEO); + if (priv->dma_cap.rssen && priv->plat->rss_en) { + if (!HW_IS_VF(priv->hw)) + ndev->hw_features |= NETIF_F_RXHASH; + } + + if (!HW_IS_VF(priv->hw)) + ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + ndev->features |= ndev->hw_features | NETIF_F_HIGHDMA; + + if (!HW_IS_VF(priv->hw)) { + ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; + ndev->features |= NETIF_F_HW_VLAN_CTAG_RX; + } + + if (priv->dma_cap.vlins) { + ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; + ndev->features |= NETIF_F_HW_VLAN_CTAG_TX; + } + + ndev->vlan_features = ndev->features; + if (!HW_IS_VF(priv->hw)) { + if (!dn200_ntuple_fdir_init(priv)) + ndev->hw_features |= NETIF_F_NTUPLE; + } + + ndev->hw_features |= NETIF_F_RXALL; + + if (!PRIV_IS_VF(priv)) { + /* Add Loopback capability to the device */ + ndev->hw_features |= NETIF_F_LOOPBACK; + } + /* Both mac100 and gmac support receive VLAN tag detection */ + priv->msg_enable = default_msg_level; + + /* Initialize RSS */ + rxq = priv->plat->rx_queues_to_use; + netdev_rss_key_fill(priv->rss.key, sizeof(priv->rss.key)); + for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++) + priv->rss.table[i] = ethtool_rxfh_indir_default(i, rxq); + + // enable ip 2-tuple,tcp 4-tuple and udp 4-tuple + priv->rss.rss_flags = + DN200_RSS_IP2TE | DN200_RSS_UDP4TE | DN200_RSS_TCP4TE; + + dn200_init_ndev_tunnel(ndev); + ret = dn200_max_mtu_get(priv, &priv->plat->maxmtu, &min_mtu); + if (ret < 0) { + dev_err(priv->device, "max_mtu alloc err!\n"); + goto error_mtu_init; + } + + /* MTU range: 68 - hw-specific max */ + ndev->min_mtu = min_mtu; + if (priv->plat->has_xgmac) + ndev->max_mtu = XGMAC_JUMBO_LEN; + else if ((priv->plat->enh_desc) || (priv->chip_id >= DWMAC_CORE_4_00)) + ndev->max_mtu = JUMBO_LEN; + 
else + ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN); + /* Will not overwrite ndev->max_mtu if plat->maxmtu > ndev->max_mtu + * as well as plat->maxmtu < ndev->min_mtu which is a invalid range. + */ + if (priv->plat->maxmtu < ndev->max_mtu && + priv->plat->maxmtu >= ndev->min_mtu) + ndev->max_mtu = priv->plat->maxmtu; + else if (priv->plat->maxmtu < ndev->min_mtu) + dev_warn(priv->device, + "%s: warning: maxmtu having invalid value (%d)\n", + __func__, priv->plat->maxmtu); + priv->flow_ctrl = FLOW_AUTO; /* RX/TX pause on */ + if (priv->plat_ex->max_num_vlan) + priv->hw->max_vlan_num = priv->plat_ex->max_num_vlan; + else + priv->hw->max_vlan_num = 0; + + mutex_init(&priv->lock); + + /* If a specific clk_csr value is passed from the platform + * this means that the CSR Clock Range selection cannot be + * changed at run-time and it is fixed. Viceversa the driver'll try to + * set the MDC clock dynamically according to the csr actual + * clock input. + */ + if (priv->plat->clk_csr >= 0) + priv->clk_csr = priv->plat->clk_csr; + else + dn200_clk_csr_set(priv); + + ret = dn200_phyinfo_init(priv, ndev); + if (ret) { + dev_err(priv->device, "%s: ERROR %i init phy info\n", + __func__, ret); + goto error_phy_init; + } + netif_set_real_num_rx_queues(ndev, priv->plat_ex->default_rx_queue_num); + netif_set_real_num_tx_queues(ndev, priv->plat_ex->default_tx_queue_num); + dn200_axi_init_for_raid(priv); + ret = register_netdev(ndev); + if (ret) { + dev_err(priv->device, "%s: ERROR %i registering the device\n", + __func__, ret); + goto error_netdev_register; + } + netif_carrier_off(ndev); + + if (priv->plat->dump_debug_regs) + priv->plat->dump_debug_regs(priv->plat->bsp_priv); + + /* init itr divisor at first, and will be updated when link up */ + dn200_set_itr_divisor(priv, SPEED_10000); + + dn200_init_fs(ndev); + set_bit(DN200_DCB_DOWN, &priv->state); + set_bit(DN200_DOWN, &priv->state); + + /* pf/vf shared static info that will not be cleared when dev close/open */ + 
dn200_sriov_static_init(priv); + + if (!PRIV_IS_VF(priv)) { + priv->temp_attr = kmalloc(sizeof(struct device_attribute), GFP_KERNEL); + if (priv->temp_attr) { + priv->temp_attr->show = temp_show; + priv->temp_attr->store = NULL; + priv->temp_attr->attr.name = "temp"; + priv->temp_attr->attr.mode = 00444; + if (sysfs_create_file(&ndev->dev.kobj, &priv->temp_attr->attr)) { + dev_info(priv->device, "sysfs_create_file failed.\n"); + kfree(priv->temp_attr); + priv->temp_attr = NULL; + } + } + } + + if (PRIV_IS_VF(priv)) + DN200_ITR_SYNC_SET(priv->hw, vf_probe, priv->plat_ex->vf_offset, 1); + + if (priv->mii) { + dn200_xgmac_clock_ctl(priv); + if (priv->mii) { + /* set rgmii rx clock from soc */ + dn200_xgmac_rx_ext_clk_set(priv, false); + /* workaround: set phy loopback for reg timeout */ + mdiobus_write(priv->mii, priv->plat->phy_addr, 0, 0x4140); + } + + usleep_range(10000, 20000); + dn200_reset(priv); + usleep_range(10000, 20000); + if (priv->mii) + mdiobus_write(priv->mii, priv->plat->phy_addr, 0, 0x1940); + } else if (!PRIV_IS_VF(priv)) { + dn200_xgmac_clock_ctl(priv); + usleep_range(10000, 20000); + dn200_reset(priv); + usleep_range(10000, 20000); + } + + if (!PRIV_IS_PUREPF(priv)) + dn200_napi_add(ndev); + + if (!PRIV_IS_VF(priv)) + set_bit(DN200_PROBE_FINISHED, &priv->state); + return ret; + +error_netdev_register: +error_phy_init: +error_hw_init: +error_mtu_init: + destroy_workqueue(priv->wq); + destroy_workqueue(priv->tx_wq); + return ret; +} +EXPORT_SYMBOL_GPL(dn200_dvr_probe); + +static void dn200_task_stop(struct dn200_priv *priv) +{ + del_timer_sync(&priv->reset_timer); + if (!PRIV_IS_VF(priv)) + del_timer_sync(&priv->upgrade_timer); + if (priv->retask.func) + cancel_work_sync(&priv->retask); + if (priv->service_task.func) + cancel_work_sync(&priv->service_task); + if (priv->vf_process_task.func) + cancel_work_sync(&priv->vf_process_task); + if (priv->vf_linkset_task.func) + cancel_work_sync(&priv->vf_linkset_task); +} + +/** + * dn200_dvr_remove + * @dev: 
device pointer + * Description: this function resets the TX/RX processes, disables the MAC RX/TX + * changes the link status, releases the DMA descriptor rings. + */ +int dn200_dvr_remove(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct dn200_priv *priv = netdev_priv(ndev); + + if (PRIV_IS_VF(priv)) + DN200_ITR_SYNC_SET(priv->hw, vf_probe, priv->plat_ex->vf_offset, 0); + else { + if (priv->temp_attr) { + sysfs_remove_file(&dev->kobj, &priv->temp_attr->attr); + kfree(priv->temp_attr); + priv->temp_attr = NULL; + } + } + + if (ndev->reg_state == NETREG_REGISTERED) + unregister_netdev(ndev); + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + dn200_phy_info_remove(ndev); + if (!PRIV_IS_PUREPF(priv)) + dn200_napi_del(priv->dev); + dn200_axi_uninit_for_raid(priv); + if (!priv->fdir_enties) + kfree(priv->fdir_enties); + dn200_exit_fs(ndev); + if (priv->hw->pcs != DN200_PCS_TBI && priv->hw->pcs != DN200_PCS_RTBI) + dn200_mdio_unregister(ndev); + dn200_task_stop(priv); + destroy_workqueue(priv->wq); + destroy_workqueue(priv->tx_wq); + mutex_destroy(&priv->lock); + return 0; +} +EXPORT_SYMBOL_GPL(dn200_dvr_remove); + +/** + * dn200_suspend - suspend callback + * @dev: device pointer + * Description: this is the function to suspend the device and it is called + * by the platform driver to stop the network queue, release the resources, + * program the PMT register (for WoL), clean and release driver resources. 
+ */ +int dn200_suspend(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct dn200_priv *priv = netdev_priv(ndev); + struct plat_dn200_data *plat_ex = priv->plat_ex; + struct dn200_ctrl_resource *ctrl = &plat_ex->ctrl; + int retry = 0; + + if (test_bit(DN200_PCIE_UNAVAILD, &priv->state) || !dn200_hwif_id_check(priv->ioaddr)) + return -EIO; + + if (test_and_set_bit(DN200_NET_SUSPENDED, &priv->state)) + return 0; + + netif_trans_update(priv->dev); + while (test_and_set_bit(DN200_RESETING, &priv->state)) { + usleep_range(1000, 2000); + if (retry++ >= 3) + return 0; + } + + if (ndev && netif_running(ndev)) { + rtnl_lock(); + if (test_bit(DN200_SYS_SUSPENDED, &priv->state)) + priv->dev->netdev_ops->ndo_stop(priv->dev); + else + dev_close(priv->dev); + rtnl_unlock(); + } + + if (!test_bit(DN200_SYS_SUSPENDED, &priv->state)) + return 0; + + if (!plat_ex->use_msi) { + synchronize_irq(ctrl->msix_entries[plat_ex->total_irq - 2].vector); + devm_free_irq(ctrl->dev, + ctrl->msix_entries[plat_ex->total_irq - 2].vector, + plat_ex); + synchronize_irq(ctrl->msix_entries[plat_ex->total_irq - 1].vector); + devm_free_irq(ctrl->dev, + ctrl->msix_entries[plat_ex->total_irq - 1].vector, + plat_ex); + synchronize_irq(ctrl->msix_entries[0].vector); + devm_free_irq(ctrl->dev, ctrl->msix_entries[0].vector, ctrl); + pci_disable_msix(to_pci_dev(dev)); + } else { + pci_free_irq_vectors(to_pci_dev(dev)); + } + return 0; +} +EXPORT_SYMBOL_GPL(dn200_suspend); + +/** + * dn200_resume - resume callback + * @dev: device pointer + * Description: when resume this function is invoked to setup the DMA and CORE + * in a usable state. 
+ */ +int dn200_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct net_device *ndev = dev_get_drvdata(dev); + struct dn200_priv *priv = netdev_priv(ndev); + struct dn200_resources res; + int i; + + if (test_bit(DN200_PCIE_UNAVAILD, &priv->state) || !dn200_hwif_id_check(priv->ioaddr)) + return -EIO; + + if (test_bit(DN200_SYS_SUSPENDED, &priv->state)) { + memset(&res, 0, sizeof(res)); + + res.addr = priv->ioaddr; + + irq_info_pfvf_release(pdev, &priv->plat_ex->ctrl, true); + dn200_config_interrupt(pdev, priv->plat, priv->plat_ex, &res, !priv->plat_ex->use_msi); + + priv->dev->irq = res.irq; + priv->lpi_irq = res.lpi_irq; + priv->sfty_ce_irq = res.sfty_ce_irq; + priv->sfty_ue_irq = res.sfty_ue_irq; + priv->xpcs_irq = res.xpcs_vec; + for (i = 0; i < MTL_MAX_RX_QUEUES; i++) + priv->rx_irq[i] = res.rx_irq[i]; + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) + priv->tx_irq[i] = res.tx_irq[i]; + dn200_axi_init_for_raid(priv); + } + + if (!test_and_clear_bit(DN200_NET_SUSPENDED, &priv->state)) + return 0; + + dn200_hw_sideband_init(PRIV_PHY_INFO(priv)); + rtnl_lock(); + if (test_bit(DN200_SYS_SUSPENDED, &priv->state)) { + clear_bit(DN200_SYS_SUSPENDED, &priv->state); + priv->dev->netdev_ops->ndo_open(priv->dev); + } else { + dev_open(priv->dev, NULL); + } + rtnl_unlock(); + clear_bit(DN200_DOWN, &priv->state); + clear_bit(DN200_RESETING, &priv->state); + clear_bit(DN200_SYS_SUSPENDED, &priv->state); + + return 0; +} +EXPORT_SYMBOL_GPL(dn200_resume); diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_mdio.c b/drivers/net/ethernet/dapustor/dn200/dn200_mdio.c new file mode 100644 index 000000000000..6ea32507253c --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_mdio.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dwxgmac_comm.h" +#include "dn200.h" + +#define MII_BUSY 0x00000001 +#define MII_WRITE 0x00000002 +#define MII_DATA_MASK GENMASK(15, 0) + +/* GMAC4 defines */ +#define MII_GMAC4_GOC_SHIFT 2 +#define MII_GMAC4_REG_ADDR_SHIFT 16 +#define MII_GMAC4_WRITE BIT(MII_GMAC4_GOC_SHIFT) +#define MII_GMAC4_READ (3 << MII_GMAC4_GOC_SHIFT) +#define MII_GMAC4_C45E BIT(1) + +/* XGMAC defines */ +#define MII_XGMAC_SADDR BIT(18) +#define MII_XGMAC_CMD_SHIFT 16 +#define MII_XGMAC_WRITE BIT(MII_XGMAC_CMD_SHIFT) +#define MII_XGMAC_READ (3 << MII_XGMAC_CMD_SHIFT) +#define MII_XGMAC_BUSY BIT(22) +/* Extend max C22_address or phy_address from 3 to 15 */ +#define MII_XGMAC_MAX_C22ADDR 31 +#define MII_XGMAC_C22P_MASK GENMASK(MII_XGMAC_MAX_C22ADDR, 0) +#define MII_XGMAC_PA_SHIFT 16 +#define MII_XGMAC_DA_SHIFT 21 +#define MII_DEVADDR_C45_SHIFT 16 +#define MII_REGADDR_C45_MASK GENMASK(15, 0) +static DEFINE_MUTEX(dn200_mdio_mutex); + +static void dn200_xgmac2_c45_format(struct dn200_priv *priv, int phyaddr, + int devad, int phyreg, u32 *hw_addr) +{ + u32 tmp; + + /* Set port as Clause 45 */ + tmp = readl(priv->ioaddr + XGMAC_MDIO_C22P); + tmp &= ~BIT(phyaddr); + writel(tmp, priv->ioaddr + XGMAC_MDIO_C22P); + + *hw_addr = (phyaddr << MII_XGMAC_PA_SHIFT) | (phyreg & 0xffff); + *hw_addr |= (phyreg >> MII_DEVADDR_C45_SHIFT) << MII_XGMAC_DA_SHIFT; +} + +static void dn200_xgmac2_c22_format(struct dn200_priv *priv, int phyaddr, + int phyreg, u32 *hw_addr) +{ + u32 tmp; + /* Set port as Clause 22 */ + tmp = readl(priv->ioaddr + XGMAC_MDIO_C22P); + tmp &= ~MII_XGMAC_C22P_MASK; + tmp |= BIT(phyaddr); + writel(tmp, priv->ioaddr + XGMAC_MDIO_C22P); + + *hw_addr = (phyaddr << MII_XGMAC_PA_SHIFT) | (phyreg & 0x1f); +} + +static void dn200_mdio_bus_channel_set(struct dn200_priv *priv) +{ + u32 mdc_map, mdio_map; + + /* [8:0]mdc_map: 136 + xge_index * 22 */ + mdc_map = (priv->plat_ex->funcid * 22 + 136) & 
GENMASK(8, 0); + /* [24:16]mdio_map: 137 + xge_index * 22 */ + mdio_map = ((priv->plat_ex->funcid * 22 + 137) << 16) & GENMASK(24, 16); + writel(mdc_map | mdio_map, priv->ioaddr + XGMAC_MDIO_CHANNEL); +} + +static int dn200_xgmac2_mdio_read(struct dn200_priv *priv, + int phyaddr, int phyreg, u32 value) +{ + unsigned int mii_address = priv->hw->mii.addr; + unsigned int mii_data = priv->hw->mii.data; + u32 tmp, addr; + int ret; + + mutex_lock(&dn200_mdio_mutex); + dn200_mdio_bus_channel_set(priv); + + /* Wait until any existing MII operation is complete */ + if (readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000)) { + ret = -EBUSY; + goto err_disable_clks; + } + + if (!(value & MII_XGMAC_SADDR)) + dn200_xgmac2_c45_format(priv, phyaddr, 0, phyreg, &addr); + else + dn200_xgmac2_c22_format(priv, phyaddr, phyreg, &addr); + + value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift) + & priv->hw->mii.clk_csr_mask; + value |= MII_XGMAC_READ; + + /* Wait until any existing MII operation is complete */ + if (readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000)) { + ret = -EBUSY; + goto err_disable_clks; + } + + /* Set the MII address register to read */ + writel(addr, priv->ioaddr + mii_address); + writel(value, priv->ioaddr + mii_data); + + /* Wait until any existing MII operation is complete */ + if (readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000)) { + ret = -EBUSY; + goto err_disable_clks; + } + + /* Read the data from the MII data register */ + ret = (int)readl(priv->ioaddr + mii_data) & GENMASK(15, 0); + +err_disable_clks: + mutex_unlock(&dn200_mdio_mutex); + return ret; +} + +static int dn200_xgmac2_mdio_read_c22(struct mii_bus *bus, int phyaddr, + int phyreg) +{ + struct net_device *ndev = bus->priv; + struct dn200_priv *priv; + + priv = netdev_priv(ndev); + + /* HW does not support C22 addr >= 4 */ + if (phyaddr > MII_XGMAC_MAX_C22ADDR) + return -ENODEV; + + 
return dn200_xgmac2_mdio_read(priv, phyaddr, phyreg, + MII_XGMAC_BUSY | MII_XGMAC_SADDR); +} + +static int dn200_xgmac2_mdio_read_c45(struct mii_bus *bus, int phyaddr, + int devad, int phyreg) +{ + struct net_device *ndev = bus->priv; + struct dn200_priv *priv; + + priv = netdev_priv(ndev); + return dn200_xgmac2_mdio_read(priv, phyaddr, phyreg, MII_XGMAC_BUSY); +} + +static int dn200_xgmac2_mdio_write(struct dn200_priv *priv, + int phyaddr, int phyreg, u32 value, u16 phydata) +{ + unsigned int mii_address = priv->hw->mii.addr; + unsigned int mii_data = priv->hw->mii.data; + u32 tmp, addr; + int ret; + + mutex_lock(&dn200_mdio_mutex); + dn200_mdio_bus_channel_set(priv); + + /* Wait until any existing MII operation is complete */ + if (readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000)) { + ret = -EBUSY; + goto err_disable_clks; + } + + if (!(value & MII_XGMAC_SADDR)) + dn200_xgmac2_c45_format(priv, phyaddr, 0, phyreg, &addr); + else + dn200_xgmac2_c22_format(priv, phyaddr, phyreg, &addr); + + value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift) + & priv->hw->mii.clk_csr_mask; + value |= phydata; + value |= MII_XGMAC_WRITE; + + /* Wait until any existing MII operation is complete */ + if (readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000)) { + ret = -EBUSY; + goto err_disable_clks; + } + + /* Set the MII address register to write */ + writel(addr, priv->ioaddr + mii_address); + writel(value, priv->ioaddr + mii_data); + + /* Wait until any existing MII operation is complete */ + ret = readl_poll_timeout(priv->ioaddr + mii_data, tmp, + !(tmp & MII_XGMAC_BUSY), 100, 10000); + +err_disable_clks: + mutex_unlock(&dn200_mdio_mutex); + return ret; +} + +static int dn200_xgmac2_mdio_write_c22(struct mii_bus *bus, int phyaddr, + int phyreg, u16 phydata) +{ + struct net_device *ndev = bus->priv; + struct dn200_priv *priv; + + priv = netdev_priv(ndev); + + /* HW does not support C22 addr >= 4 */ + if 
(phyaddr > MII_XGMAC_MAX_C22ADDR) + return -ENODEV; + + return dn200_xgmac2_mdio_write(priv, phyaddr, phyreg, + MII_XGMAC_BUSY | MII_XGMAC_SADDR, phydata); +} + +static int dn200_xgmac2_mdio_write_c45(struct mii_bus *bus, int phyaddr, + int devad, int phyreg, u16 phydata) +{ + struct net_device *ndev = bus->priv; + struct dn200_priv *priv; + + priv = netdev_priv(ndev); + return dn200_xgmac2_mdio_write(priv, phyaddr, devad, MII_XGMAC_BUSY, phydata); +} + + +/** + * dn200_mdio_reset + * @bus: points to the mii_bus structure + * Description: reset the MII bus + */ +int dn200_mdio_reset(struct mii_bus *bus) +{ + return 0; +} + +/** + * dn200_mdio_register + * @ndev: net device structure + * Description: it registers the MII bus + */ +int dn200_mdio_register(struct net_device *ndev) +{ + int err = 0; + struct mii_bus *new_bus; + struct dn200_priv *priv = netdev_priv(ndev); + struct dn200_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data; + struct device_node *mdio_node = priv->plat->mdio_node; + struct device *dev = ndev->dev.parent; + int addr, found, max_addr; + int i = 0; + + if (!mdio_bus_data) + return 0; + + new_bus = mdiobus_alloc(); + if (!new_bus) + return -ENOMEM; + + if (mdio_bus_data->irqs) { + while (mdio_bus_data->irqs[i] && (i < sizeof(new_bus->irq))) { + new_bus->irq[i] = mdio_bus_data->irqs[i]; + i++; + } + } + + new_bus->name = "dn200"; + + if (priv->plat->has_xgmac) { + new_bus->read = &dn200_xgmac2_mdio_read_c22; + new_bus->write = &dn200_xgmac2_mdio_write_c22; + new_bus->read_c45 = &dn200_xgmac2_mdio_read_c45; + new_bus->write_c45 = &dn200_xgmac2_mdio_write_c45; + /* Right now only C22 phys are supported */ + max_addr = MII_XGMAC_MAX_C22ADDR + 1; + + /* Check if DT specified an unsupported phy addr */ + if (priv->plat->phy_addr > MII_XGMAC_MAX_C22ADDR) + dev_err(dev, "Unsupported phy_addr (max=%d)\n", + MII_XGMAC_MAX_C22ADDR); + } else { + goto bus_register_fail; + } + + if (mdio_bus_data->needs_reset) + new_bus->reset = &dn200_mdio_reset; + 
snprintf(new_bus->id, MII_BUS_ID_SIZE, "%s-%x-%x-%x", + new_bus->name, pci_domain_nr(priv->plat_ex->pdev->bus), + priv->plat_ex->pdev->bus->number, priv->plat->bus_id); + new_bus->priv = ndev; + /* ignore phy addr 0 */ + if (mdio_bus_data->phy_mask) + new_bus->phy_mask = mdio_bus_data->phy_mask; + else + new_bus->phy_mask = 0x1; + new_bus->parent = priv->device; + + err = mdiobus_register(new_bus); + if (err != 0) { + dev_err(dev, "Cannot register the MDIO bus\n"); + goto bus_register_fail; + } + + /* Looks like we need a dummy read for XGMAC only and C45 PHYs */ + if (priv->plat->has_xgmac) + dn200_xgmac2_mdio_read_c45(new_bus, 0, 0, 0); + + if (priv->plat->phy_node || mdio_node) + goto bus_register_done; + + found = 0; + for (addr = 0; addr < max_addr; addr++) { + struct phy_device *phydev = mdiobus_get_phy(new_bus, addr); + + if (!phydev) + continue; + + /* If an IRQ was provided to be assigned after + * the bus probe, do it here. + */ + if (!mdio_bus_data->irqs && + mdio_bus_data->probed_phy_irq > 0) { + new_bus->irq[addr] = mdio_bus_data->probed_phy_irq; + phydev->irq = mdio_bus_data->probed_phy_irq; + } + + /* If we're going to bind the MAC to this PHY bus, + * and no PHY number was provided to the MAC, + * use the one probed here. 
+ */ + if (priv->plat->phy_addr == -1) + priv->plat->phy_addr = addr; + + // phy_attached_info(phydev); + found = 1; + } + + if (!found && !mdio_node) { + dev_warn(dev, "No PHY found\n"); + err = -ENODEV; + goto no_phy_found; + } + +bus_register_done: + priv->mii = new_bus; + + return 0; + +no_phy_found: + mdiobus_unregister(new_bus); +bus_register_fail: + mdiobus_free(new_bus); + return err; +} + +/** + * dn200_mdio_unregister + * @ndev: net device structure + * Description: it unregisters the MII bus + */ +int dn200_mdio_unregister(struct net_device *ndev) +{ + struct dn200_priv *priv = netdev_priv(ndev); + + if (!priv->mii) + return 0; + + mdiobus_unregister(priv->mii); + priv->mii->priv = NULL; + mdiobus_free(priv->mii); + priv->mii = NULL; + + return 0; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_pci.c b/drivers/net/ethernet/dapustor/dn200/dn200_pci.c new file mode 100644 index 000000000000..c9362b0c1bcc --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_pci.c @@ -0,0 +1,1602 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dn200.h" +#include "dn200_self.h" +#include "linux/delay.h" +#include "dn200_sriov.h" +#include "dn200_ctrl.h" +#include "dn200_prod.h" + +#define DRV_SUMMARY "DapuStor(R) Ethernet Connection DN200 Series Linux Driver" +static const char dn200_driver_str[] = DRV_SUMMARY; +static const char dn200_copyright[] = "Copyright (c) 2024, DapuStor Corporation."; + +MODULE_AUTHOR("DapuStor Corporation, "); +MODULE_DESCRIPTION(DRV_SUMMARY); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(DRV_MODULE_VERSION); + +static int queue_max_set = 8; +module_param(queue_max_set, int, 0644); +MODULE_PARM_DESC(queue_max_set, + "PF0 and PF1 channel number set.range from 1 to 8"); + +#define XGE_NUM 4 +#define DN200_PCI_BAR_NUM 6 + +struct pcb_type dn200_pcb_type[] = { + { 0b000, DN200_HW_PCB_TYPE_3 }, + { 0b001, DN200_HW_PCB_TYPE_0 }, + { 0b010, DN200_HW_PCB_TYPE_2 }, + { 0b011, DN200_HW_PCB_TYPE_1 }, +}; + +static void dn200_set_clk_conf(struct dn200_ctrl_resource *ctrl, u32 off, + u32 val) +{ + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (plat_ex->speed_cmd) + fw_reg_write(ctrl, 0x24300000 + off, val); + else + writel(val, plat_ex->io_addr + DN200_PCIE_BAROFF + off); +} + +static void dn200_get_clk_conf(struct dn200_ctrl_resource *ctrl, u32 off, + u32 *val) +{ + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, struct plat_dn200_data, ctrl); + if (plat_ex->speed_cmd) + fw_reg_read(ctrl, 0x24300000 + off, val); + else + *val = readl(plat_ex->io_addr + DN200_PCIE_BAROFF + off); +} + +/*xgmac use PLLA*/ +static bool dn200_clock_phy_detect(struct pci_dev *pdev, + struct dn200_ctrl_resource *ctrl) +{ + u32 val_refa0 = 0, val_refa1 = 0; + int retry = 0; + int val_judge = 0; + u32 val[2]; + struct plat_dn200_data *plat_ex; + + plat_ex = container_of(ctrl, 
struct plat_dn200_data, ctrl); + dn200_get_clk_conf(ctrl, PCIE_PHY0_REFA_CLKDET_EN_REG, &val[0]); + dn200_get_clk_conf(ctrl, PCIE_PHY1_REFA_CLKDET_EN_REG, &val[1]); + + dn200_set_clk_conf(ctrl, PCIE_PHY0_REFA_CLKDET_EN_REG, val[0] | 0x1); + dn200_set_clk_conf(ctrl, PCIE_PHY1_REFA_CLKDET_EN_REG, val[1] | 0x1); + do { + dn200_get_clk_conf(ctrl, PCIE_PHY0_REFA_CLKDET_RESULT_REG, + &val_refa0); + if (plat_ex->raid_supported) { + if (!val_refa0) { + val_judge = 0; + } else { + val_judge = 1; + break; + } + } else { + dn200_get_clk_conf(ctrl, PCIE_PHY1_REFA_CLKDET_RESULT_REG, + &val_refa1); + if (!val_refa0 || !val_refa1) + val_judge = 0; + else { + val_judge = 1; + break; + } + } + usleep_range(100, 200); + retry++; + } while (retry < 200); + + dn200_set_clk_conf(ctrl, PCIE_PHY0_REFA_CLKDET_EN_REG, val[0]); + dn200_set_clk_conf(ctrl, PCIE_PHY1_REFA_CLKDET_EN_REG, val[1]); + + if (val_judge) + return 1; + + dev_err(&pdev->dev, "func = %s, phy clock not found\n", __func__); + return 0; +} + +struct dn200_pci_info { + int (*setup)(struct pci_dev *pdev, struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex); +}; + +static int dn200_xgmac_gpio_data_set(struct pci_dev *pdev, + struct plat_dn200_data *plat_ex, + const struct dn200_gpio_data *gpio_data, + u8 off) +{ + struct dn200_gpio_data *gpio; + + plat_ex->gpio_data = + devm_kzalloc(&pdev->dev, sizeof(struct dn200_gpio_data), + GFP_KERNEL); + if (!plat_ex->gpio_data) + return -ENOMEM; + + gpio = plat_ex->gpio_data; + gpio->gpio_addr_offset = gpio_data[off].gpio_addr_offset; + gpio->sfp_detect_pin = gpio_data[off].sfp_detect_pin; + gpio->sfp_tx_disable_pin = gpio_data[off].sfp_tx_disable_pin; + gpio->sfp_tx_fault_pin = gpio_data[off].sfp_tx_fault_pin; + gpio->sfp_rx_los_pin = gpio_data[off].sfp_rx_los_pin; + gpio->sfp_rs0_pin = gpio_data[off].sfp_rs0_pin; + gpio->sfp_rs1_pin = gpio_data[off].sfp_rs1_pin; + gpio->sfp_led1_pin = gpio_data[off].sfp_led1_pin; + gpio->sfp_led2_pin = gpio_data[off].sfp_led2_pin; + 
gpio->reg_off_set_write = gpio_data[off].reg_off_set_write; + gpio->reg_off_set_read = gpio_data[off].reg_off_set_read; + return 0; +} + +static void dn200_xgmac_private_data_set(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex, + const struct xge_private_data *xge_data, + const struct xge_link_config *link_config) +{ + + plat->phy_addr = xge_data->phy_addr; + plat->bus_id = xge_data->bus_id; + + plat->clk_ptp_rate = link_config->clk_ptp_rate; // 250MHz for XGMAC, 50MHz for GMAC + plat->clk_csr = link_config->clk_csr; // DN200_CSR_I_4 + plat->phy_interface = link_config->phy_interface; + plat->max_speed = link_config->max_speed; + plat_ex->max_speed = link_config->max_speed; + plat->clk_ref_rate = link_config->clk_ref_rate; + plat_ex->has_xpcs = link_config->has_xpcs; + switch (plat->bus_id) { + case 1: + case 2: + plat->tx_queues_to_use = queue_max_set; + plat->rx_queues_to_use = queue_max_set; + plat_ex->rx_queues_reserved = + xge_data->rx_queues_total - queue_max_set; + plat_ex->tx_queues_reserved = + xge_data->tx_queues_total - queue_max_set; + plat_ex->max_vfs = plat_ex->tx_queues_reserved; + break; + case 3: + case 4: + if (plat_ex->sriov_supported) { + plat->tx_queues_to_use = + xge_data->tx_queues_total - + xge_data->tx_queues_reserved; + plat->rx_queues_to_use = + xge_data->rx_queues_total - + xge_data->rx_queues_reserved; + } else { + plat->tx_queues_to_use = xge_data->tx_queues_to_use; + plat->rx_queues_to_use = xge_data->rx_queues_to_use; + } + plat_ex->rx_queues_reserved = xge_data->rx_queues_reserved; + plat_ex->tx_queues_reserved = xge_data->tx_queues_reserved; + plat_ex->max_vfs = xge_data->max_vfs; + break; + default: + dev_err(&pdev->dev, "Invalid bus id %x\n", plat->bus_id); + break; + } + plat_ex->tx_queues_total = xge_data->tx_queues_total; + plat_ex->rx_queues_total = xge_data->rx_queues_total; + plat_ex->rx_used_mtl_queues = xge_data->rx_used_mtl_queues; + plat_ex->default_rx_queue_num = 
plat->rx_queues_to_use; + plat_ex->default_tx_queue_num = plat->tx_queues_to_use; + /*init address bits limit(e.g. 40, 32) and forbidden bits for different driver or product */ + if (!(plat_ex->is_vf || plat_ex->sriov_supported)) { + plat_ex->addr_bits_limit = + addr_limit[DRV_PURE_PF].addr_bits_limit; + plat_ex->addr_forbid_bits = + addr_limit[DRV_PURE_PF].addr_forbid_bits; + } else if (plat_ex->sriov_supported) { + plat_ex->addr_bits_limit = + addr_limit[DRV_SRIOV_PF].addr_bits_limit; + plat_ex->addr_forbid_bits = + addr_limit[DRV_SRIOV_PF].addr_forbid_bits; + } else { + plat_ex->addr_bits_limit = addr_limit[DRV_VF].addr_bits_limit; + plat_ex->addr_forbid_bits = addr_limit[DRV_VF].addr_forbid_bits; + } + +} + +static int dn200_vf_plat_info_set(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex) +{ + struct dn200_vf_info info; + + dn200_get_vf_queue_info(pcim_iomap_table(pdev)[SRIOV_LRAM_BAR_OFF] + + 0x10000, &info, plat_ex->pf_id, + plat_ex->vf_offset); + plat_ex->tx_queue_start = info.tx_queue_start; + plat_ex->rx_queue_start = info.rx_queue_start; + plat->rx_queues_to_use = info.rx_queues_num; + plat->tx_queues_to_use = info.tx_queues_num; + plat_ex->default_rx_queue_num = info.rx_queues_num; + plat_ex->default_tx_queue_num = info.tx_queues_num; + plat_ex->max_vfs = info.max_vfs; + plat_ex->pf.registered_vfs = info.registered_vfs; + plat_ex->pf.vlan_num_per_vf = info.max_vlan_num; + plat_ex->max_num_vlan = info.max_vlan_num; + plat->rss_en = 0; + return 0; +} + +static int dn200_xge_info_set(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex) +{ + int i = 0, j = 0; + int max_pf; + int found_pf = 0; /*0: vf device; 1: pf device */ + int found_vf = 0; /*1: vf device; 0: pf device */ + int dev_func = plat_ex->funcid; + int ret = 0; + + for (; i < ARRAY_SIZE(device_map); i++) { + if (pdev->device == device_map[i].pf_deviceid) { + max_pf = device_map[i].max_pf; + for (j = 0; j < 
max_pf; j++) { + if ((dev_func & 0xff) == + device_map[i].pci_funcid[j]) { + found_pf = 1; + goto found; + } + } + } else if (pdev->device == device_map[i].vf_deviceid) { + max_pf = device_map[i].max_pf; + for (j = 0; j < max_pf; j++) { + if ((dev_func & 0xff) >= + device_map[i].min_vf_funcid[j] + && (dev_func & 0xff) <= + device_map[i].max_vf_funcid[j]) { + found_vf = 1; + goto found; + } + } + } + } + +found: + if (found_pf || found_vf) { + plat_ex->total_pfs = device_map[i].max_pf; + plat_ex->pf_id = device_map[i].pci_funcid[j]; + plat_ex->xpcs_index = device_map[i].xpcs_index[j]; + plat_ex->nvme_supported = device_map[i].nvme_supported; + plat_ex->raid_supported = device_map[i].raid_supported; + plat_ex->pf_max_iatu = device_map[i].pf_max_iatu; + plat_ex->vf_total_iatu = device_map[i].vf_total_iatu; + plat_ex->upgrade_with_flowing = device_map[i].upgrade_with_flowing; + if (found_pf) { + plat_ex->vf_offset = 0; + plat_ex->is_vf = 0; + plat_ex->sriov_supported = + device_map[i].sriov_supported; + if (device_map[i].max_pf == 4) { + ret = dn200_xgmac_gpio_data_set(pdev, plat_ex, + gpio_data_prod_type[0][0], 0); + } else { + if (plat_ex->hw_pcb_ver_type > + DN200_2P_MAX_PCB_ID + || plat_ex->hw_pcb_ver_type < + DN200_2P_MIN_PCB_ID) { + dev_err(&pdev->dev, + "funid %x deviceid %x : Invalid hw_pcb_ver_type\n", + dev_func, pdev->device); + return -EINVAL; + } + ret = dn200_xgmac_gpio_data_set(pdev, plat_ex, + gpio_data_prod_type[1][dn200_pcb_id_map2_gpio_type[plat_ex->hw_pcb_ver_type]], + plat_ex->pf_id); + } + if (ret) + return -ENOMEM; + } else if (found_vf) { + plat_ex->vf_offset = + (dev_func & 0xff) - device_map[i].min_vf_funcid[j]; + plat_ex->is_vf = 1; + plat_ex->sriov_supported = 0; + dn200_vf_plat_info_set(pdev, plat, plat_ex); + } + dn200_xgmac_private_data_set(pdev, plat, plat_ex, + &xge_private_data_table[j], + device_map[i].link_config); + } else { + dev_err(&pdev->dev, "Invalid funid %x deviceid %x\n", dev_func, + pdev->device); + return -EINVAL; + } + 
return 0; +} + +static int dn200_default_data(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex) +{ + int i; + int prio_cnt_per_queue, prio; + + plat->has_xgmac = 1; + plat->has_gmac = 0; + plat->has_gmac4 = 0; + plat->force_sf_dma_mode = 1; + + /* Set default value for multicast hash bins */ + plat->multicast_filter_bins = HASH_TABLE_SIZE; + + /* Set default value for unicast filter entries */ + plat->unicast_filter_entries = 8; // 1-31 + + /* Set the maxmtu to a default of JUMBO_LEN */ + plat->maxmtu = JUMBO_LEN; + + /* pbl can be 1, 4, 8, 16, 32, when rxpbl assign to 2, the rx fifo overflow almost disapper + * as pblx8 is true, so the rx dma transfer data length is: + * 2 * 8 * 16(dma beats size is 16) = 256Btyes + * notes: must assign "rxpbl as 2" and "receive store & forward as 0", + * the overflow issue almost resolve and must assign "txpbl as 4", + * as the pcie interface max limit is 512(4 * 8 * 16) bytes + */ + plat->dma_cfg->pbl = 16; + plat->dma_cfg->aal = false; + plat->dma_cfg->onekbbe = false; + plat->dma_cfg->txpbl = 4; + /* to solve kunpen's 2 port iperf at the same,nedd change rxpbl from 2 to 4 */ + plat->dma_cfg->rxpbl = 4; + plat->dma_cfg->pblx8 = true; + plat->tso_en = true; + /* To save memory, so set split head feature as disabled */ + plat->sph_disable = true; + plat->rss_en = 1; + + if (plat_ex->is_vf) { + dn200_vf_plat_info_set(pdev, plat, plat_ex); + if (!plat->tx_queues_to_use || plat->tx_queues_to_use > 8) + return -EIO; + } + + /* MTL Configuration */ + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WRR; + for (i = 0; i < plat->tx_queues_to_use; i++) { + plat->tx_queues_cfg[i].prio = BIT(i); + plat->tx_queues_cfg[i].use_prio = true; + plat->tx_queues_cfg[i].mode_to_use = MTL_QUEUE_DCB; + plat->tx_queues_cfg[i].weight = 10 + i * 5; + } + + plat->rx_sched_algorithm = MTL_RX_ALGORITHM_WSP; + + prio_cnt_per_queue = 8 / plat_ex->rx_used_mtl_queues; + for (i = 0; i < plat_ex->rx_used_mtl_queues; i++) { + 
/* e.g. use four mtl queues, every queue map two priorities, + * queue 0 map to prio 0 & 1, queue 1 map to prio 2 & 3, + * queue 2 map to prio 4 & 5, queue 3 map to prio 6 & 7, + * 1. L2 tag packets will route to mtl queue 0 ~ 3: + * prio 0 & 1 route to queue 0, prio 2 & 3 route to queue 1, + * prio 4 & 5 route to queue 2, prio 6 & 7 route to queue 3. + * 2. untag packets will route to default queue (e.g. queue 0) + */ + for (prio = i * prio_cnt_per_queue; + prio < (i + 1) * prio_cnt_per_queue; prio++) { + plat->rx_queues_cfg[i].prio |= BIT(prio); /* one bit per prioity */ + } + plat->rx_queues_cfg[i].use_prio = true; + plat->rx_queues_cfg[i].mode_to_use = MTL_QUEUE_DCB; + plat->rx_queues_cfg[i].pkt_route = 0x0; + plat->rx_queues_cfg[i].chan = i; + /* for 4 mtl queues, weights are 1, 3, 5, 7, maximum rx weight is 7 */ + plat->rx_queues_cfg[i].weight = 2 * i + 1; + } + + /* AXI Configuration */ + plat->axi = devm_kzalloc(&pdev->dev, sizeof(*plat->axi), GFP_KERNEL); + if (!plat->axi) + return -ENOMEM; + + plat->axi->axi_wr_osr_lmt = 0xf; + plat->axi->axi_rd_osr_lmt = 0x1f; + + plat->axi->axi_fb = false; + plat->axi->axi_blen[0] = 4; + plat->axi->axi_blen[1] = 8; + plat->axi->axi_blen[2] = 16; + plat->axi->axi_blen[3] = 32; + + /* safety configuration */ + plat->safety_feat_cfg->tsoee = 1; + plat->safety_feat_cfg->mrxpee = 1; + plat->safety_feat_cfg->mestee = 1; + plat->safety_feat_cfg->mrxee = 1; + plat->safety_feat_cfg->mtxee = 1; + plat->safety_feat_cfg->epsi = 1; + plat->safety_feat_cfg->edpp = 1; + plat->safety_feat_cfg->prtyen = 1; + plat->safety_feat_cfg->tmouten = 1; + + if (plat_ex->sriov_supported) + plat_ex->max_num_vlan = 5; + else if (plat_ex->is_vf) + plat_ex->max_num_vlan = 0; + else + plat_ex->max_num_vlan = 4094; + + return 0; +} + +static const struct dn200_pci_info dn200_pci_info = { + .setup = dn200_default_data, +}; + +static bool dn200_is_queue_input_supported(struct pci_dev *pdev, + struct plat_dn200_data *plat_ex) +{ + if (plat_ex->funcid == 0 
|| plat_ex->funcid == 1) { + if (queue_max_set < 1) { + queue_max_set = 1; + dev_err(&pdev->dev, + "PF0 and PF1 queue set < 1!,change queue_max_set form illegal value to 1\n"); + return 0; + } else if (queue_max_set > DN200_CH_MAX) { + queue_max_set = 8; + dev_err(&pdev->dev, + "PF0 and PF1 queue set exceed 8, change queue_max_set form iilegal value to 8!\n"); + return 0; + } + } + return true; +} + +static int dn200_config_multi_msix(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res, bool is_purepf) +{ + int i, j, ret; + struct dn200_ctrl_resource *ctrl = &plat_ex->ctrl; + + ret = + dn200_ena_msix_range(pdev, &plat_ex->ctrl, plat_ex->default_tx_queue_num, + plat_ex->default_rx_queue_num, is_purepf); + if (ret) { + dev_err(&pdev->dev, + "func %s, line %d: dn200_ena_msix_range fail %d.\n", + __func__, __LINE__, ret); + return ret; + } + ret = + irq_queue_map(pdev, &plat_ex->ctrl, plat_ex->default_tx_queue_num, + plat_ex->default_rx_queue_num, true); + if (ret) { + dev_err(&pdev->dev, + "func %s, line %d: irq_queue_map fail %d.\n", __func__, + __LINE__, ret); + goto err_irq_config; + } + /* For TX MSIX */ + for (i = 0; i < plat_ex->default_tx_queue_num; i++) + res->tx_irq[i] = plat_ex->ctrl.msix_entries[i + 1].vector; + + /* For RX MSIX */ + for (j = 0; j < plat_ex->default_rx_queue_num; j++) + res->rx_irq[j] = plat_ex->ctrl.msix_entries[j + 1 + i].vector; + + if (plat_ex->sriov_supported || + plat_ex->pdev->device == DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF) { + res->lpi_irq = plat_ex->ctrl.msix_entries[j + 1 + i + 0].vector; + res->irq = plat_ex->ctrl.msix_entries[j + 1 + i + 1].vector; + res->sfty_ce_irq = + plat_ex->ctrl.msix_entries[j + 1 + i + 2].vector; + res->sfty_ue_irq = + plat_ex->ctrl.msix_entries[j + 1 + i + 3].vector; + } + plat->multi_msi_en = 1; + return 0; +err_irq_config: + devm_free_irq(ctrl->dev, + ctrl->msix_entries[plat_ex->total_irq - 2].vector, + plat_ex); + 
devm_free_irq(ctrl->dev, + ctrl->msix_entries[plat_ex->total_irq - 1].vector, + plat_ex); + devm_free_irq(ctrl->dev, ctrl->msix_entries[0].vector, ctrl); + pci_disable_msix(pdev); + return ret; +} + +#define DN200_IRQ(num0, num1, num2, num3, num4) \ + ((num0) | ((num1) << 6) | ((num2) << 12) | ((num3) << 18) | ((num4) << 24)) + +static int dn200_config_multi_msi(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res, + bool support_msix) +{ +#define MAX_RX_VECT_NUMB 16 +#define MAX_TX_VECT_NUMB 16 + int ret; + int i; + int msi_num; + int intr_msi_mod = 0, intr_src_mask = 0; + int rx_v[MAX_RX_VECT_NUMB]; + int tx_v[MAX_TX_VECT_NUMB]; + + memset(rx_v, 0, sizeof(rx_v)); + memset(tx_v, 0, sizeof(tx_v)); + + plat->msi_rx_base_vec = 1; + for (i = 0; i < plat->rx_queues_to_use; ++i) + rx_v[i] = plat->msi_rx_base_vec + i; + + plat->msi_tx_base_vec = plat->msi_rx_base_vec + plat->rx_queues_to_use; + for (i = 0; i < plat->tx_queues_to_use; ++i) + tx_v[i] = plat->msi_tx_base_vec + i; + + plat->msi_sfty_ce_vec = plat->msi_tx_base_vec + plat->tx_queues_to_use; + plat->msi_sfty_ue_vec = plat->msi_sfty_ce_vec + 1; + plat->msi_mac_vec = plat->msi_sfty_ue_vec + 1; + plat->msi_lpi_vec = plat->msi_mac_vec + 1; + msi_num = plat->msi_lpi_vec; + ret = + pci_alloc_irq_vectors(pdev, roundup_pow_of_two(msi_num + 1), + roundup_pow_of_two(msi_num + 1), PCI_IRQ_MSI); + if (ret < 0) { + if (!support_msix) + dev_err(&pdev->dev, + "func = %s, line = %d: multi vectors alloc fail! 
%d\n", + __func__, __LINE__, ret); + return ret; + } + dev_info(&pdev->dev, "Succeed to allocate %d MSI vectors\n", ret); + switch (plat->bus_id) { + case 1: + case 2: + /*remap each interrupt source to MSI vector: + * Rx INT source 1~8 (RX DMA Channel 1~16 INT) to vector 1 ~ RX_QUEUES + * Tx INT source 17~24(TX DMA Channel 1~16 INT) to + * vector (RX_QUEUES + 1) ~ (RX_QUEUES + TX_QUEUES) + * Sfty RE INT source 33 to vector msi_sfty_ce_vec + * Sfty UE INT source 34 to vector msi_sfty_ue_vec + * PMT INT source 35 to vector msi_mac_vec + * LPI INT source 36 to vector msi_lpi_vec + * other INT source 37 to vector 0 => of no use and mask + */ + writel(DN200_IRQ(0, rx_v[0], rx_v[1], rx_v[2], rx_v[3]), + res->addr + XGE_MSI_INTR_SRCMAP0_4(plat->bus_id - 1)); + writel(DN200_IRQ(rx_v[4], rx_v[5], rx_v[6], rx_v[7], rx_v[8]), + res->addr + XGE_MSI_INTR_SRCMAP5_9(plat->bus_id - 1)); + writel(DN200_IRQ + (rx_v[9], rx_v[10], rx_v[11], rx_v[12], rx_v[13]), + res->addr + XGE_MSI_INTR_SRCMAP10_14(plat->bus_id - 1)); + writel(DN200_IRQ(rx_v[14], rx_v[15], tx_v[0], tx_v[1], tx_v[2]), + res->addr + XGE_MSI_INTR_SRCMAP15_19(plat->bus_id - 1)); + writel(DN200_IRQ(tx_v[3], tx_v[4], tx_v[5], tx_v[6], tx_v[7]), + res->addr + XGE_MSI_INTR_SRCMAP20_24(plat->bus_id - 1)); + writel(DN200_IRQ + (tx_v[8], tx_v[9], tx_v[10], tx_v[11], tx_v[12]), + res->addr + XGE_MSI_INTR_SRCMAP25_29(plat->bus_id - 1)); + writel(DN200_IRQ + (tx_v[13], tx_v[14], tx_v[15], plat->msi_sfty_ce_vec, + plat->msi_sfty_ue_vec), + res->addr + XGE_MSI_INTR_SRCMAP30_34(plat->bus_id - 1)); + writel(DN200_IRQ + (plat->msi_mac_vec, plat->msi_lpi_vec, 0x0, 0x0, 0x0), + res->addr + XGE_MSI_INTR_SRCMAP35_39(plat->bus_id - 1)); + writel(DN200_IRQ(0, 0, plat_ex->msi_xpcs_vec, 0, 0), + res->addr + XGE_MSI_INTR_SRCMAP40_44(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP45_49(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP50_54(plat->bus_id - 1)); + writel(0, + res->addr + 
XGE_MSI_INTR_SRCMAP55_59(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP60_64(plat->bus_id - 1)); + + /* 1. Enable xge interrupt source work at msi mode, 0: not msi mode; 1: msi mode + * 2. Config interrupt vector mask for or msi, 0: use or no mask; 1: no use or mask + * e.g. + * 1-8, 17-28 msi mode enable: + * writel(0x01FE01FE, res->addr + XGE_MSI_INTR_EN_LOW(plat->bus_id - 1)); + * 33-36 msi mode enable: + * writel(0x0000001E, res->addr + XGE_MSI_INTR_EN_HIGH(plat->bus_id - 1)); + * 1-8, 17-24 no mask,vector 0 no use and mask: + * writel(0xFE01FE01, res->addr + XGE_MSI_INTR_MASK_LOW(plat->bus_id - 1)); + * 33-36 no mask: + * writel(0xFFFFFFE1, res->addr + XGE_MSI_INTR_MASK_HIGH(plat->bus_id - 1)); + */ + intr_src_mask = 0xFFFFFFFF; + for (i = 0; i < MAX_RX_VECT_NUMB; ++i) { + if (rx_v[i]) { + intr_msi_mod |= 1 << (i + 1); + intr_src_mask &= ~(1 << (i + 1)); + } + } + for (i = 0; i < MAX_TX_VECT_NUMB - 1; ++i) { + if (tx_v[i]) { + intr_msi_mod |= 1 << (MAX_RX_VECT_NUMB + i + 1); + intr_src_mask &= + ~(1 << (MAX_RX_VECT_NUMB + i + 1)); + } + } + writel(intr_msi_mod, + res->addr + XGE_MSI_INTR_EN_LOW(plat->bus_id - 1)); + writel(intr_src_mask, + res->addr + XGE_MSI_INTR_MASK_LOW(plat->bus_id - 1)); + if (plat_ex->msi_xpcs_vec) { + intr_msi_mod = 0x0000041E; + intr_src_mask = 0xFFFFFBE1; + } else { + intr_msi_mod = 0x0000001E; + intr_src_mask = 0xFFFFFFE1; + } + + if (tx_v[MAX_TX_VECT_NUMB - 1]) { + intr_msi_mod |= 0x01; + intr_src_mask &= ~0x01; + } + writel(intr_msi_mod, + res->addr + XGE_MSI_INTR_EN_HIGH(plat->bus_id - 1)); + writel(intr_src_mask, + res->addr + XGE_MSI_INTR_MASK_HIGH(plat->bus_id - 1)); + break; + case 3: + case 4: + /*remap each interrupt source to MSI vector: + * Rx INT source 1~2(RX DMA Channel 1~2 INT) to vector 1 ~ RX_QUEUES + * Tx INT source 3~4(TX DMA Channel 1~2 INT) to + * vector (RX_QUEUES + 1) ~ (RX_QUEUES + TX_QUEUES) + * Sfty RE INT source 5 to vector msi_sfty_ce_vec + * Sfty UE INT source 6 to vector 
msi_sfty_ue_vec + * PMT INT source 7 to vector msi_mac_vec + * LPI INT source 8 to vector msi_lpi_vec + * other INT source to vector 0 => of no use and mask + */ + writel(DN200_IRQ(0, rx_v[0], rx_v[1], tx_v[0], tx_v[1]), + res->addr + XGE_MSI_INTR_SRCMAP0_4(plat->bus_id - 1)); + writel(DN200_IRQ + (plat->msi_sfty_ce_vec, plat->msi_sfty_ue_vec, + plat->msi_mac_vec, plat->msi_lpi_vec, 0x0), + res->addr + XGE_MSI_INTR_SRCMAP5_9(plat->bus_id - 1)); + writel(DN200_IRQ(0, 0, 0, 0, plat_ex->msi_xpcs_vec), + res->addr + XGE_MSI_INTR_SRCMAP10_14(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP15_19(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP20_24(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP25_29(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP30_34(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP35_39(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP40_44(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP45_49(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP50_54(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP55_59(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP60_64(plat->bus_id - 1)); + + // platform interrupt source mask + if (plat_ex->msi_xpcs_vec) { + writel(0x000041FE, + res->addr + XGE_MSI_INTR_EN_LOW(plat->bus_id - + 1)); + writel(0xFFFFBE01, + res->addr + XGE_MSI_INTR_MASK_LOW(plat->bus_id - + 1)); + } else { + writel(0x000001FE, + res->addr + XGE_MSI_INTR_EN_LOW(plat->bus_id - + 1)); + writel(0xFFFFFE01, + res->addr + XGE_MSI_INTR_MASK_LOW(plat->bus_id - + 1)); + } + writel(0x00000000, + res->addr + XGE_MSI_INTR_EN_HIGH(plat->bus_id - 1)); + writel(0xFFFFFFFF, + res->addr + XGE_MSI_INTR_MASK_HIGH(plat->bus_id - 1)); + break; + default: + pci_free_irq_vectors(pdev); + dev_info(&pdev->dev, + "func = %s, line = %d:plat->bus_id = %d,Failed to enable multi MSI retval.\n", + __func__, __LINE__, 
plat->bus_id); + return DN200_FAILURE; + } + /* For Rx INT */ + for (i = 0; i < plat->rx_queues_to_use; i++) { + res->rx_irq[i] = + pci_irq_vector(pdev, plat->msi_rx_base_vec + i); + } + /* For Tx INT */ + for (i = 0; i < plat->tx_queues_to_use; i++) { + res->tx_irq[i] = + pci_irq_vector(pdev, plat->msi_tx_base_vec + i); + } + /* For PMT INT */ + if (plat->msi_mac_vec < DN200_MSI_VEC_MAX) + res->irq = pci_irq_vector(pdev, plat->msi_mac_vec); + /* For LPI INT */ + if (plat->msi_lpi_vec < DN200_MSI_VEC_MAX) + res->lpi_irq = pci_irq_vector(pdev, plat->msi_lpi_vec); + /* For CE Safety INT */ + if (plat->msi_sfty_ce_vec < DN200_MSI_VEC_MAX) + res->sfty_ce_irq = pci_irq_vector(pdev, plat->msi_sfty_ce_vec); + /* For UE Safety INT */ + if (plat->msi_sfty_ue_vec < DN200_MSI_VEC_MAX) + res->sfty_ue_irq = pci_irq_vector(pdev, plat->msi_sfty_ue_vec); + + /* For XPCS LINK INT */ + if (plat_ex->msi_xpcs_vec && + plat_ex->msi_xpcs_vec < DN200_MSI_VEC_MAX) + res->xpcs_vec = pci_irq_vector(pdev, plat_ex->msi_xpcs_vec); + + plat->multi_msi_en = 1; + return 0; +} + +static int dn200_config_single_msi(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct dn200_resources *res) +{ + int ret = 0; + + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); + if (ret < 0) { + dev_err(&pdev->dev, "Fail to enable single IRQ\n"); + return ret; + } + + if (pdev->msix_enabled) { + /* platform MSIX enable vector 0 */ + writel(0x1, res->addr + XGE_MSIX_INTR_EN_LOW(plat->bus_id - 1)); + writel(0x0, + res->addr + XGE_MSIX_INTR_EN_HIGH(plat->bus_id - 1)); + writel(0xFFFFFFFE, + res->addr + XGE_MSIX_INTR_MASK_LOW(plat->bus_id - 1)); + writel(0xFFFFFFFF, + res->addr + XGE_MSIX_INTR_MASK_HIGH(plat->bus_id - 1)); + + dev_info(&pdev->dev, "Succeed to enable MSI-X single IRQ\n"); + } else if (pdev->msi_enabled) { + /* map each interrupt source to MSI vector 0 */ + writel(0, res->addr + XGE_MSI_INTR_SRCMAP0_4(plat->bus_id - 1)); + writel(0, res->addr + XGE_MSI_INTR_SRCMAP5_9(plat->bus_id - 1)); 
+ writel(0, + res->addr + XGE_MSI_INTR_SRCMAP10_14(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP15_19(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP20_24(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP25_29(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP30_34(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP35_39(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP40_44(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP45_49(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP50_54(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP55_59(plat->bus_id - 1)); + writel(0, + res->addr + XGE_MSI_INTR_SRCMAP60_64(plat->bus_id - 1)); + /* enable and unmask int0 */ + writel(0x1, res->addr + XGE_MSI_INTR_EN_LOW(plat->bus_id - 1)); + writel(0, res->addr + XGE_MSI_INTR_EN_HIGH(plat->bus_id - 1)); + writel(0xFFFFFFFE, + res->addr + XGE_MSI_INTR_MASK_LOW(plat->bus_id - 1)); + writel(0xFFFFFFFF, + res->addr + XGE_MSI_INTR_MASK_HIGH(plat->bus_id - 1)); + + dev_info(&pdev->dev, "Succeed to enable MSI single IRQ\n"); + } else { + /* Legacy Pin interrupt, do nothing */ + dev_info(&pdev->dev, "Succeed to enable Legacy single IRQ\n"); + } + + res->irq = pci_irq_vector(pdev, 0); + plat->multi_msi_en = 0; + return 0; +} + +int dn200_config_interrupt(struct pci_dev *pdev, + struct plat_dn200enet_data *plat, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res, bool is_nvme_pf) +{ + int ret = 0; + + if (!plat_ex->is_vf && !plat_ex->sriov_supported && !is_nvme_pf) { + plat_ex->use_msi = true; + ret = + dn200_config_multi_msi(pdev, plat, plat_ex, res, + plat_ex->nvme_supported); + if (ret && plat_ex->nvme_supported) { + ret = + dn200_config_multi_msix(pdev, plat, plat_ex, res, + is_nvme_pf); + plat_ex->use_msi = false; + } else if (ret) { + ret = dn200_config_single_msi(pdev, plat, res); + if (ret) { + dev_err(&pdev->dev, + "%s: ERROR, 
failed to enable single IRQ\n", + __func__); + return ret; + } + } + } else { + ret = + dn200_config_multi_msix(pdev, plat, plat_ex, res, + is_nvme_pf); + plat_ex->use_msi = false; + } + if (plat_ex->use_msi) { + ret = dn200_ctrl_ccena(pdev, 1, 1, false); + if (ret) { + dev_err(&pdev->dev, + "func %s, line %d: ctrl cc enable timeout\n", + __func__, __LINE__); + } + } + return ret; +} + +#define PURE_PF_NO_NVME 1 +#define PURE_PF_NVME 2 +static u8 dn200_pure_pf_type(struct pci_dev *pdev) +{ + int i = 0; + + for (i = 0; i < ARRAY_SIZE(PURE_PF_DEVICE); i++) { + if (pdev->device == PURE_PF_DEVICE[i]) + return PURE_PF_NO_NVME; + } + for (i = 0; i < ARRAY_SIZE(NVME_PURE_PF_DEVICE); i++) { + if (pdev->device == NVME_PURE_PF_DEVICE[i]) + return PURE_PF_NVME; + } + return 0; +} + +static bool dn200_is_sriov_vf(struct pci_dev *pdev) +{ + int i = 0; + + for (; i < ARRAY_SIZE(SRIOV_VF_DEVICE); i++) { + if (pdev->device == SRIOV_VF_DEVICE[i]) + return true; + } + return false; +} + +static bool dn200_is_extern_phy(struct pci_dev *pdev) +{ + int i = 0; + + for (; i < ARRAY_SIZE(EXTERN_PHY_DEVICE); i++) { + if (pdev->device == EXTERN_PHY_DEVICE[i]) + return true; + } + return false; +} + +static bool dn200_is_4_port(struct pci_dev *pdev) +{ + int i = 0; + + for (; i < ARRAY_SIZE(SRIOV_4P_DEVICE); i++) { + if (pdev->device == SRIOV_4P_DEVICE[i]) + return true; + } + return false; +} + +static u8 dn200_gpio_read(struct pci_dev *pdev, struct dn200_resources *res, + u8 offset) +{ + u32 value = 0; + u8 offset_pin = 0; + u8 val = 0; + u32 gpio_offset = 0; + u32 reg_off_read = 0; + + if (dn200_is_4_port(pdev)) { + gpio_offset += DN200_SFPCTRL_MODE0_BAROFF; + reg_off_read = 8; + } else { + gpio_offset += DN200_SFPCTRL_MODE1_BAROFF; + reg_off_read = 0; + } + + /*may be offset is greater than 31 */ + offset_pin = (offset >> 5) << 2; + offset = offset & 0x1f; + value = ioread32(res->addr + gpio_offset + offset_pin + reg_off_read); + val = (value >> (offset)) & 0x1; + + return val; +} + 
+#define DN200_PCB_FW_GPIO_PIN0 49 +#define DN200_PCB_FW_GPIO_PIN1 38 +#define DN200_PCB_FW_GPIO_PIN2 29 +static int dn200_hw_pcb_version_from_gpio(struct pci_dev *pdev, + struct plat_dn200_data *plat_ex, + struct dn200_resources *res, + bool is_no_nvme) +{ + int ret = 0; + u8 val1; + u32 val; + /*vf not support gpio */ + if (dn200_is_sriov_vf(pdev)) + return ret; + + if (!is_no_nvme) { + ret = get_pcb_type(&plat_ex->ctrl, &val); + plat_ex->hw_pcb_ver_type = dn200_pcb_type[val].value; + return ret; + } + + /*vf not support gpio */ + if (dn200_is_sriov_vf(pdev)) + return ret; + /*gpio 49 for ver */ + val1 = (dn200_gpio_read(pdev, res, DN200_PCB_FW_GPIO_PIN0) << 2); + /*gpio 29 for ver */ + val1 |= (dn200_gpio_read(pdev, res, DN200_PCB_FW_GPIO_PIN1) << 1); + /*gpio 38 for ver */ + val1 |= dn200_gpio_read(pdev, res, DN200_PCB_FW_GPIO_PIN2); + + plat_ex->hw_pcb_ver_type = dn200_pcb_type[val1].value; + return ret; + +} + +/** + * dn200_pci_probe + * + * @pdev: pci device pointer + * @id: pointer to table of device id/id's. + * + * Description: This probing function gets called for all PCI devices which + * match the ID table and are not "owned" by other driver yet. This function + * gets passed a "struct pci_dev *" for each device whose entry in the ID table + * matches the device. The probe functions returns zero when the driver choose + * to take "ownership" of the device or an error code(-ve no) otherwise. 
+ */
+static int dn200_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct dn200_pci_info *info = (struct dn200_pci_info *)id->driver_data;
+	struct plat_dn200enet_data *plat = NULL;
+	struct plat_dn200_data *plat_ex = NULL;
+	struct dn200_resources res;
+	int i;
+	int ret;
+	u8 pf_type = dn200_pure_pf_type(pdev);
+
+	plat = devm_kzalloc(&pdev->dev, sizeof(*plat), GFP_KERNEL);
+	if (!plat)
+		return -ENOMEM;
+
+	plat_ex = devm_kzalloc(&pdev->dev, sizeof(*plat_ex), GFP_KERNEL);
+	/* Fix: the original re-tested plat here, letting a NULL plat_ex
+	 * escape and be dereferenced below.
+	 */
+	if (!plat_ex)
+		return -ENOMEM;
+
+	plat->mdio_bus_data = devm_kzalloc(&pdev->dev,
+					   sizeof(*plat->mdio_bus_data),
+					   GFP_KERNEL);
+	if (!plat->mdio_bus_data)
+		return -ENOMEM;
+
+	plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg),
+				     GFP_KERNEL);
+	if (!plat->dma_cfg)
+		return -ENOMEM;
+
+	plat->safety_feat_cfg = devm_kzalloc(&pdev->dev,
+					     sizeof(*plat->safety_feat_cfg),
+					     GFP_KERNEL);
+	if (!plat->safety_feat_cfg)
+		return -ENOMEM;
+	/* Enable pci device */
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "%s: ERROR: failed to enable device\n",
+			__func__);
+		return ret;
+	}
+	if (!dn200_is_sriov_vf(pdev))
+		pci_aer_clear_nonfatal_status(pdev);
+	/* set up for high or low dma */
+	if (!dn200_is_sriov_vf(pdev)) {
+		plat->addr64 = 64;
+		ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+		if (ret) {
+			ret = dma_set_mask_and_coherent(&pdev->dev,
+							DMA_BIT_MASK(32));
+			if (ret) {
+				dev_err(&pdev->dev,
+					"DMA configuration failed: 0x%x\n",
+					ret);
+				goto err_dma;
+			}
+			plat->addr64 = 32;
+		}
+	} else {
+		ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+		if (ret) {
+			dev_err(&pdev->dev,
+				"DMA configuration failed: 0x%x\n", ret);
+			goto err_dma;
+		}
+		plat->addr64 = 32;
+	}
+	plat_ex->ctrl.addr64 = plat->addr64;
+	/* Get the base address of device */
+	for (i = 0; i < DN200_PCI_BAR_NUM; i++) {
+		if (pci_resource_len(pdev, i) == 0)
+			continue;
+
+		ret = pcim_iomap_regions(pdev, BIT(i), pci_name(pdev));
+		if (ret)
+			goto err_iomap;
+	}
+	pci_set_master(pdev);
+	memset(&res, 0, sizeof(res));
+
+	if (pci_resource_len(pdev, 0) != 0)
+		res.ctrl_addr = pcim_iomap_table(pdev)[0];
+
+	if (pci_resource_len(pdev, 2) != 0) {
+		res.addr = pcim_iomap_table(pdev)[2];
+		plat_ex->io_addr = res.addr;
+	}
+	if (pci_resource_len(pdev, 4) != 0)
+		res.mail = pcim_iomap_table(pdev)[4] + 0x10000;
+
+	if (!dn200_hwif_id_check(res.addr)) {
+		dev_err(&pdev->dev, "func %s: %s\n", __func__,
+			DN200_PCIE_BAR_ERR);
+		/* Fix: ret was still 0 here, so probe reported success. */
+		ret = -ENODEV;
+		goto err_ctrl_cc;
+	}
+
+	if (!(pf_type == PURE_PF_NO_NVME || (dn200_is_sriov_vf(pdev)))) {
+		ret = dn200_ctrl_ccena(pdev, 0, 1, false);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"func %s, line %d: ctrl cc enable timeout\n",
+				__func__, __LINE__);
+			goto err_ctrl_cc;
+		}
+		set_bit(ADMIN_QUEUE_INITED, &plat_ex->ctrl.admin_state);
+	}
+
+	plat_ex->pdev = pdev;
+
+	if (!(pf_type == PURE_PF_NO_NVME)) {
+		ret = admin_queue_configure(pdev, &plat_ex->ctrl,
+					    pf_type == PURE_PF_NVME,
+					    dn200_is_extern_phy(pdev));
+		if (ret)
+			goto err_admin_queue;
+	} else {
+		plat_ex->funcid = pdev->devfn;
+	}
+	ret = irq_info_pfvf_release(pdev, &plat_ex->ctrl, true);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"func %s, line %d: vf_release fail\n",
+			__func__, __LINE__);
+		goto err_admin_queue;
+	}
+	ret = dn200_is_queue_input_supported(pdev, plat_ex);
+	if (!ret) {
+		dev_err(&pdev->dev,
+			"func %s, line %d: PF0 and PF1 queue set is illegal\n",
+			__func__, __LINE__);
+		/* Fix: ret == 0 means "unsupported"; don't return success. */
+		ret = -EINVAL;
+		goto err_admin_queue;
+	}
+	ret = dn200_hw_pcb_version_from_gpio(pdev, plat_ex, &res,
+					     pf_type == PURE_PF_NO_NVME);
+	if (ret)
+		goto err_hw_get;
+
+	ret = dn200_xge_info_set(pdev, plat, plat_ex);
+	if (ret)
+		goto err_info_set;
+	ret = info->setup(pdev, plat, plat_ex);
+	if (ret) {
+		dev_err(&pdev->dev, "func %s, line %d: info setup fail\n",
+			__func__, __LINE__);
+		goto err_info_set;
+	}
+
+	ret = dn200_config_interrupt(pdev, plat, plat_ex, &res,
+				     pf_type == PURE_PF_NVME);
+	if (ret)
+		goto err_alloc_irq;
+	if (pf_type == (u8)PURE_PF_NO_NVME)
+		plat_ex->speed_cmd = 0;
+	else
+		plat_ex->speed_cmd = 1;
+
+	if (!plat_ex->is_vf && plat_ex->has_xpcs) {
+		ret = dn200_clock_phy_detect(pdev, &plat_ex->ctrl);
+		if (!ret) {
+			/* Fix: detect failure left ret == 0 and probe
+			 * returned success after full teardown.
+			 * TODO(review): confirm dn200_clock_phy_detect()
+			 * really returns 0 on failure.
+			 */
+			ret = -ENODEV;
+			goto err_clk_detect;
+		}
+	}
+
+	ret = dn200_dvr_probe(&pdev->dev, plat, plat_ex, &res);
+	if (ret)
+		goto err_dvr_probe;
+	if (!plat_ex->is_vf)
+		dn200_configure_timestamp(&plat_ex->ctrl);
+	if (!plat_ex->is_vf)
+		dn200_register_nvme_device(&plat_ex->ctrl);
+	return 0;
+
+err_dvr_probe:
+err_clk_detect:
+	if (!plat_ex->use_msi) {
+		dn200_ctrl_res_free(pdev, &plat_ex->ctrl);
+		pci_disable_msix(pdev);
+	} else {
+		pci_free_irq_vectors(pdev);
+	}
+err_alloc_irq:
+err_admin_queue:
+err_info_set:
+err_ctrl_cc:
+err_hw_get:
+	for (i = 0; i < DN200_PCI_BAR_NUM; i++) {
+		if (pci_resource_len(pdev, i) == 0)
+			continue;
+		pcim_iounmap_regions(pdev, BIT(i));
+	}
+err_iomap:
+err_dma:
+	pci_disable_device(pdev);
+
+	return ret;
+}
+
+/**
+ * dn200_pci_remove
+ *
+ * @pdev: platform device pointer
+ * Description: this function calls the main to free the net resources
+ * and releases the PCI resources.
+ */
+static void dn200_pci_remove(struct pci_dev *pdev)
+{
+	int i;
+	struct net_device *ndev = dev_get_drvdata(&pdev->dev);
+	struct dn200_priv *priv;
+	struct plat_dn200_data *plat_ex;
+	bool pcie_avalid = false;
+
+	if (!ndev)
+		return;
+
+	priv = netdev_priv(ndev);
+	if (!priv)
+		return;
+
+	/* Only touch the hardware if the link to it is still usable. */
+	if (test_bit(DN200_PCIE_UNAVAILD, &priv->state) || !dn200_hwif_id_check(priv->ioaddr))
+		pcie_avalid = false;
+	else
+		pcie_avalid = true;
+
+	/* test DN200_RESETING to avoid reset by other subtask */
+	while (test_bit(DN200_RESETING, &priv->state))
+		usleep_range(1000, 2000);
+	set_bit(DN200_IN_REMOVE, &priv->state);
+	plat_ex = priv->plat_ex;
+	plat_ex->ctrl.pcie_ava = pcie_avalid;
+	if (!PRIV_IS_VF(priv))
+		dn200_unregister_nvme_device(&plat_ex->ctrl);
+	if (pcie_avalid)
+		dn200_sriov_disable(priv);
+	dn200_dvr_remove(&pdev->dev);
+	/* just sriov supported pf or vf need ctrl feature */
+	if (!plat_ex->use_msi) {
+		dn200_ctrl_res_free(pdev, &plat_ex->ctrl);
+		pci_disable_msix(pdev);
+	} else {
+		pci_free_irq_vectors(pdev);
+	}
+
+	for (i = 0; i < DN200_PCI_BAR_NUM; i++) {
+		if (pci_resource_len(pdev, i) == 0)
+			continue;
+		pcim_iounmap_regions(pdev, BIT(i));
+	}
+	pci_disable_device(pdev);
+}
+
+/**
+ * dn200_pci_suspend - PM suspend callback
+ * @dev: generic device (the PCI device's struct device)
+ *
+ * Marks the device suspended (DN200_SUSPENDED guards against double
+ * suspend) and invalidates the admin queue state on success.
+ */
+static int __maybe_unused dn200_pci_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct net_device *ndev = dev_get_drvdata(&pdev->dev);
+	struct dn200_priv *priv = NULL;
+	struct plat_dn200_data *plat_ex = NULL;
+	int ret;
+
+	if (!ndev)
+		return -EINVAL;
+	priv = netdev_priv(ndev);
+	if (!priv)
+		return -EINVAL;
+	plat_ex = priv->plat_ex;
+
+	/* Already suspended */
+	if (test_and_set_bit(DN200_SUSPENDED, &priv->state))
+		return 0;
+
+	/* We need to hold the RTNL lock prior to restoring interrupt schemes,
+	 * since we're going to be restoring queues
+	 */
+	set_bit(DN200_SYS_SUSPENDED, &priv->state);
+	ret = dn200_suspend(dev);
+	if (ret) {
+		clear_bit(DN200_SUSPENDED, &priv->state);
+		return ret;
+	}
+	clear_bit(ADMIN_QUEUE_INITED, &plat_ex->ctrl.admin_state);
+	return 0;
+}
+
+/**
+ * dn200_pci_resume - PM resume callback
+ * @dev: generic device (the PCI device's struct device)
+ *
+ * Defers the actual restore to priv->retask worker.
+ */
+static int __maybe_unused dn200_pci_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct net_device *ndev = dev_get_drvdata(&pdev->dev);
+	struct dn200_priv *priv = NULL;
+	int ret = 0;
+
+	if (!ndev)
+		return -EINVAL;
+	priv = netdev_priv(ndev);
+	if (!priv)
+		return -EINVAL;
+
+	/* NOTE(review): 10 s delay when an MDIO bus is present —
+	 * presumably waiting for the external PHY; confirm the duration.
+	 */
+	if (priv->mii)
+		msleep(10000);
+	if (!test_and_clear_bit(DN200_SUSPENDED, &priv->state))
+		return 0;
+	schedule_work(&priv->retask);
+	return ret;
+}
+
+/**
+ * dn200_shutdown - PCI callback for shutting down
+ * @pdev: PCI device information struct
+ **/
+static void dn200_shutdown(struct pci_dev *pdev)
+{
+	struct net_device *ndev = dev_get_drvdata(&pdev->dev);
+	struct dn200_priv *priv = NULL;
+
+	/* After fw abnormal, user remove and then insert driver,
+	 * although driver probe failure, but it will not be removed
+	 * now reboot the system, and will trigger shutdown,
+	 * but netdevice is not init, so can't access it
+	 */
+	if (!ndev)
+		return;
+
+	priv = netdev_priv(ndev);
+	if (!priv)
+		return;
+
+	/* Already suspended */
+	if (test_and_set_bit(DN200_SUSPENDED, &priv->state))
+		return;
+
+	/* We need to hold the RTNL lock prior to restoring interrupt schemes,
+	 * since we're going to be restoring queues
+	 */
+	dn200_suspend(&pdev->dev);
+	if (!priv->plat_ex->use_msi) {
+		dn200_ctrl_res_free(pdev, &priv->plat_ex->ctrl);
+		pci_disable_msix(pdev);
+	} else
+		pci_free_irq_vectors(pdev);
+}
+
+static SIMPLE_DEV_PM_OPS(dn200_pm_ops, dn200_pci_suspend, dn200_pci_resume);
+
+/* PCI IDs handled by this driver; all share the same dn200_pci_info. */
+static const struct pci_device_id dn200_id_table[] = {
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_4P_PURE_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_4P_SRIOV_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_4P_SRIOV_VF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR,
DN200_DEV_ID_SFP_10G_2P_PURE_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_2P_SRIOV_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_2P_SRIOV_VF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_1G_4P_PURE_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_1G_4P_SRIOV_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_1G_4P_SRIOV_VF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_COPP_1G_4P_PURE_PF), .driver_data =
+	  (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_2P_NVME_PUREPF),
+	  .driver_data = (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF),
+	  .driver_data = (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF),
+	  .driver_data = (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_PF),
+	  .driver_data = (kernel_ulong_t) &dn200_pci_info },
+	{ PCI_VDEVICE(DAPUSTOR, DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_VF),
+	  .driver_data = (kernel_ulong_t) &dn200_pci_info },
+	/* required last (sentinel) entry */
+	{ }
+};
+
+/**
+ * dn200_pci_error_detected - warning that something funky happened in PCI land
+ * @pdev: PCI device information struct
+ * @error: the type of PCI error
+ *
+ * Called to warn that something happened and the error handling steps
+ * are in progress. Allows the driver to quiesce things, be ready for
+ * remediation.
+ **/
+static pci_ers_result_t dn200_pci_error_detected(struct pci_dev *pdev,
+						 pci_channel_state_t error)
+{
+	struct net_device *ndev = dev_get_drvdata(&pdev->dev);
+	struct dn200_priv *priv = NULL;
+
+	dev_info(&pdev->dev, "%s: error %d\n", __func__, error);
+	if (!ndev) {
+		dev_info(&pdev->dev,
+			 "Cannot recover - error happened during device probe\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	priv = netdev_priv(ndev);
+	if (!priv)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	/* VFs opt out of AER handling (no AER-aware driver). */
+	if (PRIV_IS_VF(priv))
+		return PCI_ERS_RESULT_NO_AER_DRIVER;
+	/* shutdown all operations */
+	if (!test_and_set_bit(DN200_SUSPENDED, &priv->state))
+		dn200_suspend(&pdev->dev);
+	/* Request a slot reset */
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * dn200_pci_error_slot_reset - a PCI slot reset just happened
+ * @pdev: PCI device information struct
+ *
+ * Called to find if the driver can work with the device now that
+ * the pci slot has been reset. If a basic connection seems good
+ * (registers are readable and have sane content) then return a
+ * happy little PCI_ERS_RESULT_xxx.
+ **/ +static pci_ers_result_t dn200_pci_error_slot_reset(struct pci_dev *pdev) +{ + pci_ers_result_t result; + struct net_device *ndev = dev_get_drvdata(&pdev->dev); + struct dn200_priv *priv = NULL; + int err; + + if (!ndev) + return PCI_ERS_RESULT_DISCONNECT; + priv = netdev_priv(ndev); + if (!priv) + return PCI_ERS_RESULT_DISCONNECT; + dev_info(&pdev->dev, "%s enter reset\n", __func__); + if (PRIV_IS_VF(priv)) + return PCI_ERS_RESULT_NO_AER_DRIVER; + if (pci_enable_device_mem(pdev)) { + dev_warn(&pdev->dev, + "Cannot re-enable PCI device after reset.\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } else { + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + result = PCI_ERS_RESULT_RECOVERED; + } + + err = pci_aer_clear_nonfatal_status(pdev); + if (err) { + dev_info(&pdev->dev, + "pci_aer_clear_nonfatal_status() failed, error %d\n", + err); + result = PCI_ERS_RESULT_DISCONNECT; + } + return result; +} + +/** + * dn200_pci_error_reset_prepare - prepare device driver for pci reset + * @pdev: PCI device information struct + */ +static void dn200_pci_error_reset_prepare(struct pci_dev *pdev) +{ + struct net_device *ndev = dev_get_drvdata(&pdev->dev); + struct dn200_priv *priv = NULL; + + if (!ndev) + return; + priv = netdev_priv(ndev); + if (!priv) + return; + + if (PRIV_IS_VF(priv)) + return; + dev_info(&pdev->dev, "%s enter reset prepare\n", __func__); + /* shutdown all operations */ + if (!test_and_set_bit(DN200_SUSPENDED, &priv->state)) + dn200_suspend(&pdev->dev); +} + +/** + * dn200_pci_error_reset_done - pci reset done, device driver reset can begin + * @pdev: PCI device information struct + */ +static void dn200_pci_error_reset_done(struct pci_dev *pdev) +{ + struct net_device *ndev = dev_get_drvdata(&pdev->dev); + struct dn200_priv *priv = NULL; + + if (!ndev) + return; + priv = netdev_priv(ndev); + if (!priv) + return; + + if (PRIV_IS_VF(priv)) + return; + dev_info(&pdev->dev, "%s enter reset done\n", __func__); + if 
(!test_and_clear_bit(DN200_SUSPENDED, &priv->state)) + return; + ctrl_reset(&priv->plat_ex->ctrl, true); + dn200_resume(&pdev->dev); +} + +/** + * dn200_pci_error_resume - restart operations after PCI error recovery + * @pdev: PCI device information struct + * + * Called to allow the driver to bring things back up after PCI error + * and/or reset recovery has finished. + **/ +static void dn200_pci_error_resume(struct pci_dev *pdev) +{ + struct net_device *ndev = dev_get_drvdata(&pdev->dev); + struct dn200_priv *priv = netdev_priv(ndev); + + if (!ndev) + return; + priv = netdev_priv(ndev); + if (!priv) + return; + + dev_info(&pdev->dev, "%s pci error detect!\n", __func__); + if (PRIV_IS_VF(priv)) + return; + if (!test_bit(DN200_SUSPENDED, &priv->state)) + dn200_suspend(&pdev->dev); + dn200_resume(&pdev->dev); + clear_bit(DN200_SUSPENDED, &priv->state); +} + +static const struct pci_error_handlers dn200_err_handler = { + .error_detected = dn200_pci_error_detected, + .slot_reset = dn200_pci_error_slot_reset, + .reset_prepare = dn200_pci_error_reset_prepare, + .reset_done = dn200_pci_error_reset_done, + .resume = dn200_pci_error_resume, +}; + +MODULE_DEVICE_TABLE(pci, dn200_id_table); + +static struct pci_driver dn200_pci_driver = { + .name = DN200_RESOURCE_NAME, + .id_table = dn200_id_table, + .probe = dn200_pci_probe, + .remove = dn200_pci_remove, + .driver = { + .pm = &dn200_pm_ops, + }, + .shutdown = dn200_shutdown, + .err_handler = &dn200_err_handler, + .sriov_configure = dn200_sriov_configure, +}; + +static struct notifier_block dn200_notifier = { + .notifier_call = dn200_dev_event, +}; + +/** + * dn200_module_init - Driver registration routine + * + * dn200_module_init is the first routine called when the driver is + * loaded. All it does is register with the PCI subsystem. 
+ */ +static int __init dn200_module_init(void) +{ + int status; + + pr_info("%s\n", dn200_driver_str); + pr_info("%s\n", dn200_copyright); + status = register_netdevice_notifier(&dn200_notifier); + if (status) { + pr_err("failed to register netdevice_notifier, err %d\n", + status); + return status; + } + status = pci_register_driver(&dn200_pci_driver); + if (status) + pr_err("failed to register PCI driver, err %d\n", status); + + return status; +} + +module_init(dn200_module_init); + +/** + * dn200_module_exit - Driver exit cleanup routine + * + * dn200_module_exit is called just before the driver is removed + * from memory. + */ +static void __exit dn200_module_exit(void) +{ + unregister_netdevice_notifier(&dn200_notifier); + pci_unregister_driver(&dn200_pci_driver); + pr_info("module unloaded\n"); +} + +module_exit(dn200_module_exit); diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_phy.h b/drivers/net/ethernet/dapustor/dn200/dn200_phy.h new file mode 100644 index 000000000000..f60b72f2b35d --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_phy.h @@ -0,0 +1,716 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ * + * Author: Guo Feng + * + * Config extern phy or xpcs phy + */ + +#ifndef __DN200_PHY_H__ +#define __DN200_PHY_H__ + +#define DN200_PXE_USED 0 +#define DN200_JUST_FOR_NORMAL_DRIVER 1 +#define DN200_NORMAL_DRIVER 1 + +#include +#include "dn200.h" + + +#define DN200_XPCS_BAR_OffSET (0x40200) +#define IC_READ 0 +#define IC_WRITE 1 +#define DN200_SOFT_AN_LINK_TIMES 20 + +/* EEPROM (dev_addr = 0xA0) */ +#define DN200_I2C_EEPROM_DEV_ADDR 0x50 + +#define DN200_SFF_IDENTIFIER 0x00 +#define DN200_SFF_IDENTIFIER_SFP 0x03 + +#define DN200_SFF_VENDOR_OUI_BYTE0 0x25 +#define DN200_SFF_VENDOR_OUI_BYTE1 0x26 +#define DN200_SFF_VENDOR_OUI_BYTE2 0x27 + +#define DN200_SFF_1GBE_COMP_CODES 0x06 +#define DN200_SFP_BASE_1GBE_CC_SX BIT(0) +#define DN200_SFP_BASE_1GBE_CC_LX BIT(1) +#define DN200_SFP_BASE_1GBE_CC_CX BIT(2) +#define DN200_SFP_BASE_1GBE_CC_T BIT(3) + +#define DN200_SFF_10GBE_COMP_CODES 0x03 +#define DN200_SFP_BASE_10GBE_CC_SR BIT(4) +#define DN200_SFP_BASE_10GBE_CC_LR BIT(5) +#define DN200_SFP_BASE_10GBE_CC_LRM BIT(6) +#define DN200_SFP_BASE_10GBE_CC_ER BIT(7) + +#define DN200_SFF_CABLE_TECHNOLOGY 0x08 +#define DN200_SFP_BASE_CABLE_PASSIVE BIT(2) +#define DN200_SFP_BASE_CABLE_ACTIVE BIT(3) + +#define DN200_SFP_BASE_BR 12 +#define DN200_SFP_BASE_BR_1GBE_MIN 0x0a +#define DN200_SFP_BASE_BR_10GBE_MIN 0x64 + +#define DN200_SFF_SFF_8472_SWAP 0x5C +#define DN200_SFF_ADDRESSING_MODE BIT(2) +#define DN200_SFF_DDM_IMPLEMENTED BIT(6) + +#define DN200_SFF_SFF_8472_COMP 0x5E +#define DN200_SFF_SFF_8472_UNSUP 0x00 + +#define BLINK_ENABLE 1 +#define BLINK_DISABLE 0 +/* SFF8472 A2 (dev_addr = 0xA2) */ +#define DN200_I2C_SFF8472_DEV_ADDR 0x51 +/* SFF GPIO PINs */ +#define DN200_SFPCTRL_BAROFF 0x840130 // 0x840100-0x84012c is reserved + +/** + * bit0-tx_disable, bit1-tx_fault, bit4-sfp_loss, + * bit5-sfp_mod_detect, bit6-sfp_rs0, bit7-sfp_rs1, + * bit8-led0 from PWM, bit9-led1 for 1G, bit10-led2 for 10G + */ +#define XGMAC_SFP_DETECT_PIN 5 +#define XGMAC_SFP_TX_DIS_PIN 0 +#define 
XGMAC_SFP_TX_FAULT_PIN 1 +#define XGMAC_SFP_RX_LOS_PIN 4 +//#define XGMAC_SFP_SDA_PIN 2 +//#define XGMAC_SFP_SCL_PIN 3 +#define XGMAC_SFP_RS0_PIN 6 +#define XGMAC_SFP_RS1_PIN 7 +#define XGMAC_SFP_LED1_PIN 9 // low active +#define XGMAC_SFP_LED2_PIN 10 // low active + +/*XPCS C45 MMD REG ADDR*/ +#define SR_PMA_KR_PMD_CTRL 0x96 +#define TR_EN BIT(1) +#define TR_EN_S 1 +#define RS_TR BIT(0) +#define RS_TR_S 0 +#define SR_PMA_KR_PMD_STS 0x97 +#define SR_PMA_KR_LP_CEU 0x98 +#define LP_PRST BIT(13) +#define LP_INIT BIT(12) + +#define SR_PMA_KR_LP_CESTS 0x99 +#define LP_RR BIT(15) +#define LP_CFF_STS1 GENMASK(5, 4) +#define LP_CFF_STS0 GENMASK(3, 2) +#define LP_CFF_STSM0 GENMASK(1, 0) +#define SR_PMA_KR_LD_CEU 0x9a +#define PMA_CTRL1_LB BIT(1) +#define SR_PMA_KR_LD_CESTS 0x9b +#define SR_PMA_KR_FEC_CTRL 0xab +#define VR_XS_PMA_RX_LSTS 0x8020 +#define VR_XS_PMA_SRAM 0x809b +#define SRAM_BTLD_BYP BIT(2) +#define SRAM_EXT_LD_DN BIT(1) +#define VR_TX_GENCTRL 0x8030 +#define TX_RST_0 BIT(8) +#define TX_DT_EN BIT(12) +#define TX_EQ_CTRL0 0x8036 +#define TX_EQ_MAIN GENMASK(13, 8) +#define TX_EQ_MAIN_SHIFT (8) +#define VR_RX_GENCTRL0 0x8050 +#define RX_DT_EN_0_MASK BIT(8) +#define RX_DT_EN_0 BIT(8) +#define RX_DT_EN_0_S (8) +#define VR_RX_GENCTRL1 0x8051 +#define RX_RST_0 BIT(4) + +#define VR_PMA_KRTR_TIMER_CTRL1 0x8007 +#define VR_PMA_KRTR_TIMER_CTRL2 0x8008 +#define VR_PMA_KRTR_RX_EQ_CTRL 0x8009 +#define RX_EQ_MM BIT(15) +#define RR_RDY BIT(8) +#define VR_PMA_KRTR_TX_EQ_STS_CTRL 0x800B +#define TX_EQ_MM BIT(15) +#define VR_PMA_KRTR_TX_EQ_CFF_CTRL 0x800C +#define VR_PMA_PHY_TX_EQ_STS 0x800D + +#define VR_PMA_PHY_RX_EQ_CEU 0x800e +#define CFF_UPDT_VLD_ALL GENMASK(10, 8) +#define CFF_UPDT1_VLD BIT(10) +#define CFF_UPDT0_VLD BIT(9) +#define CFF_UPDTM1_VLD BIT(8) +#define CFF_UPDT1 GENMASK(5, 4) +#define CFF_UPDT0 GENMASK(3, 2) +#define CFF_UPDTM1 GENMASK(1, 0) +#define CFF_UPDTM1_MASK GENMASK(1, 0) +#define CFF_UPDT0_MASK GENMASK(3, 2) +#define CFF_UPDT1_MASK GENMASK(5, 4) + 
+#define VR_XS_PMA_MP32G_RXCNTX_CTRL0 0x8092 +#define VR_XS_PMA_MP_12G_16G_25G_MISC_STS 0x8098 +#define RX_ADPT_ACK BIT(12) +#define RX_ADPT_ACK_S 12 +#define VR_XS_PMA_MP32G_TXCNTX_CTRL0 0x803E +#define VR_XS_PMA_MP32G_TXCMCNTX_SEL 0x803C +#define VR_XS_PMA_MP25G_TXWIDTH_CTRL 0x8046 +#define VR_XS_PMA_MP25G_RXWIDTH_CTRL 0x80B0 +#define VR_XS_PMA_MP25G_TX_EQ_CTRL0 0x8036 +#define DN200_TX_MAIN_SHIFT 8 +#define DN200_TX_MAIN_MASK GENMASK(13, 8) +#define DN200_TX_PRE_SHIFT 0 +#define DN200_TX_PRE_MASK GENMASK(5, 0) +#define VR_XS_PMA_MP25G_TX_EQ_CTRL1 0x8037 +#define DN200_TX_POST_SHIFT 0 +#define DN200_TX_POST_MASK GENMASK(5, 0) +#define DN200_TX_EQ_OVRD BIT(6) +#define VR_XS_PMA_MP_32G_RX_EQ_CTRL4 0x805C +#define RX_AD_REQ BIT(12) +#define RX_AD_REQ_S 12 +#define RX_CDR_CTRL 0x8056 + +/*MMD PCS*/ +#define XS_PCS_STS2_TF BIT(11) +#define XS_PCS_STS2_RF BIT(10) +#define RST_DUP1 BIT(15) +#define XS_PCS_LSTS 24 +#define XS_PCS_KR_STS1 32 +#define XPCS_10G_PLU BIT(12) +#define RPCS_BKLK BIT(0) +#define XS_PCS_KR_STS2 33 +#define VR_XS_PCS_DIG_CTRL1 0x8000 +#define XPCS_PCS_RST BIT(15) +#define XPCS_EN_2_5G_MODE BIT(2) +#define XPCS_PCS_BYP_PWRUP_DUP1 BIT(1) +#define VR_XS_PCS_DIG_CTRL2 0x8001 +#define XPCS_PCS_TX_POL_INV GENMASK(7, 4) +#define XPCS_PCS_RX_POL_INV GENMASK(3, 0) +#define VR_XS_PCS_DEBUG_CTRL 0x8005 +#define RX_DT_EN_CTL BIT(6) +#define SUPRESS_LOS_DET BIT(4) + +/*MMD AN*/ +#define AN_CTRL_AN_EN BIT(12) +#define AN_CTRL_AN_EN_S 12 +#define AN_CTRL_AN_RESTART BIT(9) +#define AN_CTRL_AN_RESTART_S 9 + +#define AN_CTRL_LOW_POWER BIT(11) + +#define SR_AN_COMP_STS 0x30 + +/* SR_AN */ +#define SR_AN_ADV1 0x10 +#define AN_ADV_RF_13_MASK BIT(13) +#define AN_ADV_RF_13_SHIFT 13 +#define AAN_ADV_ACK_MASK BIT(14) +#define AN_ADV_ACK_SHIFT 14 +#define AN_ADV_NP BIT(15) +#define AN_ADV_NP_S 15 +#define SR_AN_ADV2 0x11 +#define SR_AN_ADV3 0x12 +#define KR10G_FEC_ABL BIT(14) +#define KR10G_FEC_ABL_S 14 +#define KR10G_FEC_REQ BIT(15) +#define KR10G_FEC_REQ_S 15 +#define 
SR_AN_LP_ABL1 0x13 +#define SR_AN_LP_ABL2 0x14 +#define SR_AN_LP_ABL3 0x15 +#define SR_AN_XNP_TX1 0x16 +#define SR_AN_XNP_TX2 0x17 +#define SR_AN_XNP_TX3 0x18 +#define AN_LP_XNP_ABL1 0x19 +#define VR_AN_DIG_CTRL1 0x8000 +#define CL73_TMR_OVR_RIDE BIT(3) +#define CL73_TMR_OVR_RIDE_S 3 +#define VR_AN_INTR_MSK 0x8001 +#define VR_AN_INTR 0x8002 +#define AN_PG_RCV BIT(2) +#define AN_INC_LINK BIT(1) +#define AN_INT_CMPLT BIT(0) +#define VR_AN_TIMER_CTRL0 0x8004 +#define VR_AN_TIMER_CTRL1 0x8005 + +/* Clause 73 Defines */ +/* AN_LP_ABL1 */ +#define C73_PAUSE BIT(10) +#define C73_ASYM_PAUSE BIT(11) +#define C73_AN_ADV_SF 0x1 +/* AN_LP_ABL2 */ +#define C73_1000KX BIT(5) +#define C73_10000KX4 BIT(6) +#define C73_10000KR BIT(7) +/* AN_LP_ABL3 */ +#define C73_2500KX BIT(0) +#define C73_5000KR BIT(1) +#define C73_LP_FEC_EN BIT(14) +#define C73_NEED_FEC_EN BIT(15) + +/* EEE Mode Control Register */ +#define VR_MII_EEE_MCTRL0 0x8006 +#define VR_MII_EEE_MCTRL1 0x800b +#define VR_MII_DIG_CTRL2 0x80e1 +/* VR MII EEE Control 0 defines */ +#define VR_MII_EEE_LTX_EN BIT(0) /* LPI Tx Enable */ +#define VR_MII_EEE_LRX_EN BIT(1) /* LPI Rx Enable */ +#define VR_MII_EEE_TX_QUIET_EN BIT(2) /* Tx Quiet Enable */ +#define VR_MII_EEE_RX_QUIET_EN BIT(3) /* Rx Quiet Enable */ +#define VR_MII_EEE_TX_EN_CTRL BIT(4) /* Tx Control Enable */ +#define VR_MII_EEE_RX_EN_CTRL BIT(7) /* Rx Control Enable */ +/* VR MII EEE Control 1 defines */ +#define VR_MII_EEE_TRN_LPI BIT(0) /* Transparent Mode Enable */ + +#define VR_MII_EEE_MULT_FACT_100NS_SHIFT 8 +#define VR_MII_EEE_MULT_FACT_100NS GENMASK(11, 8) + +#define SR_MII_CTRL 0x0 +#define AN_ENABLE BIT(12) +#define AN_ENABLE_SHIFT 12 +#define AN_ENABLE_MASK BIT(12) + +#define XPCSCLKENABLE 0x20 + +enum rx_train_state { + RX_EQ_NONE = 0, + RX_EQ_SEND_INIT, + RX_EQ_WAIT_UPDATE, + RX_EQ_SEND_HOLD, + RX_EQ_WAIT_NOTUPDATE, + RX_EQ_SEND_COF, + RX_EQ_POLL_COF, + RX_EQ_LD_NOCMD, + RX_EQ_READY, +}; + +enum tx_train_state { + TX_EQ_NONE = 0, + TX_EQ_POLL_LP_CMD, + 
TX_EQ_WAIT_LD_VLD, + TX_EQ_WAIT_HOLD_CMD, + TX_EQ_WAIT_LD_INVLD, + TX_EQ_LP_RDY, +}; + +struct xpcs_link_down_dump { + char *reg_str; + u8 dev; + u16 reg; +}; + +#define GET_BITS(_var, _index, _width) \ + (((_var) >> (_index)) & ((0x1 << (_width)) - 1)) + +#define SET_BITS(_var, _index, _width, _val) \ + do { \ + __tmp_val = _val; \ + __tmp_index = _index; \ + __tmp_width = _width; \ + __tmp_unsed_var = _var; \ + (__tmp_val) &= ~(((0x1 << (__tmp_width)) - 1) << (__tmp_index)); \ + (__tmp_val) |= (((__tmp_val) & ((0x1 << (__tmp_width)) - 1)) << (__tmp_index)); \ + } while (0) + +#define XGE_IOREAD(_pdata, _reg) ioread32((_pdata)->ioaddr 0x20000 + (_reg)) + +#define XGE_IOWRITE(_pdata, _reg, _val) \ + iowrite32((_val), (_pdata)->ioaddr + 0x20000 + (_reg)) + +#define XPCS_GET_BITS(_var, _prefix, _field) \ + GET_BITS((_var), \ + _prefix##_##_field##_INDEX, \ + _prefix##_##_field##_WIDTH) + +#define XPCS_SET_BITS(_var, _prefix, _field, _val) \ + SET_BITS((_var), \ + _prefix##_##_field##_INDEX, \ + _prefix##_##_field##_WIDTH, (_val)) + +#define XPCS32_IOWRITE(_pdata, _off, _val) writel(_val, (_pdata)->xpcs_regs_base + DN200_XPCS_BAR_OffSET + (_off)) + +#define XPCS32_IOREAD(_pdata, _off) \ + readl((_pdata)->xpcs_regs_base + DN200_XPCS_BAR_OffSET + (_off)) + +enum dn200_media_type { + DN200_MEDIA_TYPE_UNKNOWN = 0, + DN200_MEDIA_TYPE_XPCS_1000BASEX, + DN200_MEDIA_TYPE_XPCS_10GBASEKR, + DN200_MEDIA_TYPE_PHY_1000BASEX, + DN200_MEDIA_TYPE_PHY_COPPER, + DN200_MEDIA_TYPE_VIRTUAL, + DN200_MEDIA_TYPE_MAX, +}; + +enum dn200_link_status { + DN200_LINK_DOWN, + DN200_LINK_UP, +}; + +enum dn200_duplex_info { + DN200_DUP_HALF, + DN200_DUP_FULL, +}; + +enum dn200_speed_info { + DN200_SPEED_UNKOWN, + DN200_SPEED_SGMII_10, + DN200_SPEED_SGMII_100, + DN200_SPEED_SGMII_1000, + DN200_SPEED_1000BASEX, + DN200_SPEED_10GKR, + DN200_SPEED_2500, +}; + +enum dn200_sfp_type { + DN200_SFP_TYPE_UNKNOWN = 0, + DN200_SFP_TYPE_SR, + DN200_SFP_TYPE_LR, + DN200_SFP_TYPE_NOT_PRESENT = 0XFFFE, + 
DN200_SFP_TYPE_NOT_KNOWN = 0XFFFF +}; + +enum dn200_sfp_module_type { + DN200_PHY_SFP_MODULE_UNKNOWN = 0, + DN200_PHY_SFP_MODULE_AVAGO, + DN200_PHY_SFP_MODULE_INTEL, + DN200_PHY_SFP_MODULE_GENERIC +}; + +#define DN200_SFF_VENDOR_OUI_AVAGO 0x00176A00 + +enum an_state { + DN200_AN_DISABLE = 0, + DN200_AN_ENABLE, +}; + +enum dn200_sfp_cable { + DN200_SFP_CABLE_UNKNOWN = 0, + DN200_SFP_CABLE_ACTIVE, + DN200_SFP_CABLE_PASSIVE, + DN200_SFP_CABLE_FIBRE, +}; + +enum dn200_sfp_speed { + DN200_SFP_SPEED_UNKNOWN = 0, + DN200_SFP_SPEED_100 = BIT(1), + DN200_SFP_SPEED_1000 = BIT(2), + DN200_SFP_SPEED_10000 = BIT(3), +}; + +enum dn200_sfp_speed_type { + DN200_SFP_TYPE_SPEED_UNKNOWN = 0, + DN200_SFP_TYPE_1000 = BIT(0), + DN200_SFP_TYPE_10000 = BIT(1), +}; + +enum dn200_sfp_base { + DN200_SFP_BASE_UNKNOWN = 0, + DN200_SFP_BASE_1000_T = BIT(1), + DN200_SFP_BASE_1000_SX = BIT(2), + DN200_SFP_BASE_1000_LX = BIT(3), + DN200_SFP_BASE_1000_CX = BIT(4), + DN200_SFP_BASE_10000_SR = BIT(5), + DN200_SFP_BASE_10000_LR = BIT(6), + DN200_SFP_BASE_10000_LRM = BIT(7), + DN200_SFP_BASE_10000_ER = BIT(8), + DN200_SFP_BASE_10000_CR = BIT(9), +}; + +static const int dn200_xpcs_usxgmii_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_Autoneg_BIT, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + +static const int dn200_xpcs_10gkr_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + +static const int dn200_xpcs_sgmii_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_Autoneg_BIT, + ETHTOOL_LINK_MODE_10baseT_Half_BIT, + ETHTOOL_LINK_MODE_10baseT_Full_BIT, + ETHTOOL_LINK_MODE_100baseT_Half_BIT, + ETHTOOL_LINK_MODE_100baseT_Full_BIT, + 
ETHTOOL_LINK_MODE_1000baseT_Half_BIT, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + +enum DN200_MDIO_MODE { + DN200_MDIO_MODE_NONE = 0, + DN200_MDIO_MODE_CL22, + DN200_MDIO_MODE_CL45, +}; + +/* Link mode bit operations */ +#define DN200_ZERO_SUP(_ls) \ + ethtool_link_ksettings_zero_link_mode((_ls), supported) + +#define DN200_SET_SUP(_ls, _mode) \ + ethtool_link_ksettings_add_link_mode((_ls), supported, _mode) + +#define DN200_CLR_SUP(_ls, _mode) \ + ethtool_link_ksettings_del_link_mode((_ls), supported, _mode) + +#define DN200_IS_SUP(_ls, _mode) \ + ethtool_link_ksettings_test_link_mode((_ls), supported, _mode) + +#define DN200_ZERO_ADV(_ls) \ + ethtool_link_ksettings_zero_link_mode((_ls), advertising) + +#define DN200_SET_ADV(_ls, _mode) \ + ethtool_link_ksettings_add_link_mode((_ls), advertising, _mode) + +#define DN200_CLR_ADV(_ls, _mode) \ + ethtool_link_ksettings_del_link_mode((_ls), advertising, _mode) + +#define DN200_ADV(_ls, _mode) \ + ethtool_link_ksettings_test_link_mode((_ls), advertising, _mode) + +#define DN200_ZERO_LP_ADV(_ls) \ + ethtool_link_ksettings_zero_link_mode((_ls), lp_advertising) + +#define DN200_SET_LP_ADV(_ls, _mode) \ + ethtool_link_ksettings_add_link_mode((_ls), lp_advertising, _mode) + +#define DN200_CLR_LP_ADV(_ls, _mode) \ + ethtool_link_ksettings_del_link_mode((_ls), lp_advertising, _mode) + +#define DN200_LP_ADV(_ls, _mode) \ + ethtool_link_ksettings_test_link_mode((_ls), lp_advertising, _mode) + +#define DN200_LM_COPY(_dst, _dname, _src, _sname) \ + bitmap_copy((_dst)->link_modes._dname, \ + (_src)->link_modes._sname, \ + __ETHTOOL_LINK_MODE_MASK_NBITS) + +static inline void dn200_linkmode_and(unsigned long *dst, + const unsigned long *a, + const unsigned long *b) +{ + bitmap_and(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static inline int dn200_linkmode_test_bit(int nr, + const unsigned long *addr) +{ + return test_bit(nr, addr); +} + +struct dn200_phy_info; + +struct dn200_xpcs_info { + 
int (*xpcs_read)(struct dn200_phy_info *phy_info, u32 addr, u32 devad, + u32 reg); + int (*xpcs_write)(struct dn200_phy_info *phy_info, u32 addr, u32 devad, + u32 reg, u32 val); + void __iomem *xpcs_regs_base; +}; +#define DN200_MAX_PHY_DUMP_NUM (ARRAY_SIZE(xpcs_link_down_dump_regs)) + +/*reset sfp info per 2 seconds*/ +#define DN200_SFP_RESET_TIME msecs_to_jiffies(3 * 1000) +#define DN200_KT_TRAIN_TIME msecs_to_jiffies(600) +struct dn200_phy_ops { + int (*init)(struct dn200_phy_info *phy_info); + int (*set_link)(struct dn200_phy_info *phy_info); + int (*start)(struct dn200_phy_info *phy_info); + int (*stop)(struct dn200_phy_info *phy_info); + int (*reset)(struct dn200_phy_info *phy_info); + int (*set_speeds)(struct dn200_phy_info *phy_info); + void (*led_control)(struct dn200_phy_info *phy_info, bool enable); + void (*blink_control)(struct dn200_phy_info *phy_info, + bool link_status); + int (*link_status)(struct dn200_phy_info *phy_info); + int (*media_type_get)(struct dn200_phy_info *phy_info); + int (*identity)(struct dn200_phy_info *phy_info); + int (*an_config)(struct dn200_phy_info *phy_info); + + int (*read_i2c_byte)(struct dn200_phy_info *phy_info, u8 byte_offset, + u8 dev_addr, u8 *data); + int (*write_i2c_byte)(struct dn200_phy_info *phy_info, u8 byte_offset, + u8 dev_addr, u8 data); + int (*read_i2c_eeprom)(struct dn200_phy_info *phy_info, u8 byte_offset, + u8 *eeprom_data); + int (*read_i2c_sff8472)(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 *eeprom_data); + int (*write_i2c_eeprom)(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 eeprom_data); + void (*init_phy_timer)(struct dn200_phy_info *phy_info); + void (*start_phy_timer)(struct dn200_phy_info *phy_info); + void (*stop_phy_timer)(struct dn200_phy_info *phy_info); + void (*phy_timer_del)(struct dn200_phy_info *phy_info); + int (*set_link_ksettings)(struct net_device *netdev, + const struct ethtool_link_ksettings *cmd); + int (*get_link_ksettings)(struct net_device *netdev, + struct 
ethtool_link_ksettings *cmd); + int (*init_eee)(struct dn200_phy_info *phy_info, bool clk_stop_enable); + int (*set_eee)(struct dn200_phy_info *phy_info, + struct ethtool_eee *data); + int (*get_eee)(struct dn200_phy_info *phy_info, + struct ethtool_eee *data); + int (*get_phy_pauseparam)(struct dn200_phy_info *phy_info, + struct ethtool_pauseparam *pause); + int (*set_phy_pauseparam)(struct dn200_phy_info *phy_info, + struct ethtool_pauseparam *pause); + int (*nway_reset)(struct dn200_phy_info *phy_info); + int (*phy_loopback)(struct dn200_phy_info *phy_info, bool enable); +}; + +enum dn200_phy_state { + DN200_PHY_EMPTY = 0, + DN200_PHY_SFP_INITED = 1, + DN200_PHY_SFP_NEED_RESET = 2, + DN200_PHY_STARTED = 3, + DN200_PHY_MULTISPEED_SETUP = 4, + DN200_PHY_IN_SFP_INIT = 5, + DN200_PHY_IN_RESET = 6, + DN200_PHY_IN_TRAIN = 7, +}; + +#define DN200_MAX_BLK_ERR_CNT 3 + +struct dn200_phy_info { + /*phy link modes */ + struct ethtool_link_ksettings lks; + + struct net_device *dev; + /*phy ops */ + const struct dn200_phy_ops *phy_ops; + const struct dn200_mac_ops *mac_ops; + /*extern phy info */ + struct mii_bus *mii_bus; + struct phy_device *phydev; + /*inband xpcs phy info */ + struct dn200_xpcs_info *xpcs; + struct dn200_gpio_data *gpio_data; + unsigned long phy_state; + + /*common phy type */ + enum DN200_MDIO_MODE phydev_mode; + enum dn200_media_type media_type; + enum an_state an; + enum an_state cur_an; + bool an_sucess; + enum rx_train_state rx_eq_states; + enum tx_train_state tx_eq_states; + unsigned long tr_timeout; + int pause; + u32 eee_broken_modes; + phy_interface_t phy_interface; + + /*link status */ + u32 last_link_speed; /*last link speed state <= setting_speed */ + u32 speed; /*current used, <= setting_speed */ + u32 setting_speed; /*set by user, <= max_speed */ + u32 max_speed; /*set by pcie deviceid */ + bool self_adap_reset; + unsigned long speed_reset_time; + u8 port_type; + enum dn200_duplex_info dup; + enum dn200_link_status link_status; + /*gpio base 
addr */ + void __iomem *gpio_base; + + u8 phy_addr; + u8 xpcs_idx; + /*phy sfp info */ + u8 sfp_type; + u8 sfp_id; + u8 sfp_module_type; + + u8 xpcs_sfp_valid:1; + u8 sfp_has_gpio:1; + u8 sfp_setup_needed:1; + u8 sfp_rx_los:1; + u8 sfp_tx_disable:1; + u8 sfp_mod_absent:1; + u8 sfp_tx_falut:1; + u8 sfp_changed:1; + u32 sfp_base; + enum dn200_sfp_cable sfp_cable; + u32 sfp_speed; + bool multispeed_sfp; + u8 blk_err_ck; + u8 blk_err_cnt; + /* Service routine support */ + struct workqueue_struct *dev_workqueue; + struct work_struct phy_status_work; + struct work_struct kr_train_work; + struct delayed_work phy_multispeed_work; + struct timer_list phy_status_timer; + u32 phy_status_time_intr; + bool phy_loopback_flag; + bool mac_debug_active; + bool recfg_an; + u16 link_modes; +#define DN300_100BASET_Full BIT(0) +#define DN300_1000BASET_Full BIT(1) +#define DN300_1000BASEX_Full BIT(2) +#define DN300_10000baseSR_Full BIT(3) +#define DN300_10000baseLRM_Full BIT(4) +#define DN300_10000baseLR_Full BIT(5) +#define DN300_10000baseKR_Full BIT(6) +#define DN300_10000baseCR_Full BIT(7) +#define DN300_10000baseER_Full BIT(8) +}; + +struct dn200_mac_ops { + void (*mac_link_down)(struct dn200_priv *config, unsigned int mode, + phy_interface_t interface); + void (*mac_link_up)(struct dn200_priv *config, + struct phy_device *phy, unsigned int mode, + phy_interface_t interface, int speed, int duplex, + bool tx_pause, bool rx_pause); + void (*mac_speed_set)(struct dn200_priv *priv, + phy_interface_t interface, int speed); +}; + +/*already link up, 500ms interval*/ +#define DN200_PHY_STATUS_NINTR (500) +/*link down, 100ms interval*/ +#define DN200_PHY_STATUS_DINTR (100) + +#define DN200_ERR_BASE (0x100) + +/* Error Codes */ +#define DN200_ERR_EEPROM -(DN200_ERR_BASE + 1) +#define DN200_ERR_EEPROM_CHECKSUM -(DN200_ERR_BASE + 2) +#define DN200_ERR_PHY -(DN200_ERR_BASE + 3) +#define DN200_ERR_CONFIG -(DN200_ERR_BASE + 4) +#define DN200_ERR_PARAM -(DN200_ERR_BASE + 5) +#define 
DN200_ERR_MAC_TYPE -(DN200_ERR_BASE + 6) +#define DN200_ERR_UNKNOWN_PHY -(DN200_ERR_BASE + 7) +#define DN200_ERR_LINK_SETUP -(DN200_ERR_BASE + 8) +#define DN200_ERR_ADAPTER_STOPPED -(DN200_ERR_BASE + 9) +#define DN200_ERR_INVALID_MAC_ADDR -(DN200_ERR_BASE + 10) +#define DN200_ERR_DEVICE_NOT_SUPPORTED -(DN200_ERR_BASE + 11) +#define DN200_ERR_MASTER_REQUESTS_PENDING -(DN200_ERR_BASE + 12) +#define DN200_ERR_INVALID_LINK_SETTINGS -(DN200_ERR_BASE + 13) +#define DN200_ERR_AUTONEG_NOT_COMPLETE -(DN200_ERR_BASE + 14) +#define DN200_ERR_RESET_FAILED -(DN200_ERR_BASE + 15) +#define DN200_ERR_SWFW_SYNC -(DN200_ERR_BASE + 16) +#define DN200_ERR_PHY_ADDR_INVALID -(DN200_ERR_BASE + 17) +#define DN200_ERR_I2C -(DN200_ERR_BASE + 18) +#define DN200_ERR_SFP_NOT_SUPPORTED -(DN200_ERR_BASE + 19) +#define DN200_ERR_SFP_NOT_PRESENT -(DN200_ERR_BASE + 20) +#define DN200_ERR_SFP_NO_INIT_SEQ_PRESENT -(DN200_ERR_BASE + 21) + +int dn200_phy_info_init(struct net_device *dev, + const struct dn200_mac_ops *mac_ops); +void dn200_hw_sideband_init(struct dn200_phy_info *phy_info); +int dn200_phy_info_remove(struct net_device *dev); +int dn200_phy_fec_enable(struct net_device *dev, bool enable); +int dn200_xpcs_config_eee(struct dn200_phy_info *phy_info, int mult_fact_100ns, + int enable); +irqreturn_t dn200_phy_status_isr(int irq, void *dev_id); +int dn200_phy_clock_stable_judge(struct dn200_phy_info *phy_info); +#define PRIV_PHY_OPS_CHECK(priv) \ + ((priv)->plat_ex->phy_info ? 
!!(priv)->plat_ex->phy_info->phy_ops : 0) +#define PRIV_PHY_INFO(priv) ((priv)->plat_ex->phy_info) +#define PRIV_PHY_OPS(priv) ((priv)->plat_ex->phy_info->phy_ops) +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_phy_impl.c b/drivers/net/ethernet/dapustor/dn200/dn200_phy_impl.c new file mode 100644 index 000000000000..91778d6d8dfc --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_phy_impl.c @@ -0,0 +1,4227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + * + * Author: Guo Feng + * + * Config extern phy or xpcs phy + */ +#include "dn200_phy.h" +#include +#include +#include +#include +#include +#include +#include "dn200_self.h" +#include "dn200_sriov.h" +#include "dn200_self.h" +#include "mmc.h" + +#pragma GCC diagnostic ignored "-Warray-bounds" +static DEFINE_MUTEX(dn200_gpio_mutex); +static DEFINE_SPINLOCK(dn200_xpcs_lock); +struct xpcs_link_down_dump xpcs_link_down_dump_regs[] = { + /*DUMP PMA_PMD STATUS REG INFO */ + { "SR_PMA_STATUS1", MDIO_MMD_PMAPMD, MDIO_STAT1 }, //0x40004 + { "SR_PMA_STATUS2", MDIO_MMD_PMAPMD, MDIO_STAT2 }, //0x40020 + { "SR_PMA_RX_SIG_DET", MDIO_MMD_PMAPMD, MDIO_PMA_RXDET }, //0x40040 + + /*DUMP PCS STATUS REG INFO */ + { "SR_XS_PCS_STS1", MDIO_MMD_PCS, MDIO_STAT1 }, //0xc0004 + { "SR_XS_PCS_STS2", MDIO_MMD_PCS, MDIO_STAT2 }, //0xc0020 + { "SR_XS_PCS_LSTS", MDIO_MMD_PCS, MDIO_PHYXS_LNSTAT }, //0xc0060 + { "SR_XS_PCS_KR_STS1", MDIO_MMD_PCS, XS_PCS_KR_STS1 }, //0xc0080 + { "SR_XS_PCS_KR_STS2", MDIO_MMD_PCS, XS_PCS_KR_STS2 }, //0xc0084 + + /*DUMP SERDES STATUS REG INFO */ + { "VR_XS_PMA_RX_LSTS", MDIO_MMD_PMAPMD, VR_XS_PMA_RX_LSTS }, //0x60080 + + /*DUMP AN STATUS REG INFO */ + { "AN CTRL1", MDIO_MMD_AN, MDIO_CTRL1 }, + { "AN STAT2", MDIO_MMD_AN, MDIO_STAT2 }, + { "AN COMP_STS", MDIO_MMD_AN, SR_AN_COMP_STS }, +}; + +static int dn200_phy_info_state_change(struct dn200_phy_info *phy_info); +static void dn200_phy_print_status(struct dn200_phy_info *phy_info); +static 
int dn200_xpcs_read(struct dn200_phy_info *phy_info, u32 addr, u32 devad, + u32 reg) +{ + u32 phy_reg = 0; + u32 val = 0; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + int ret = 0; + + if (devad == MDIO_DEVAD_NONE) { + // do nothing + return 0xffff; + } + phy_reg |= (devad & 0x1F) << 18; + phy_reg |= (reg & 0xFFFF) << 2; + if (phy_reg >= 0xc0000 && phy_reg <= 0xc00ff) { + phy_reg -= 0xc0000; + val = XPCS32_IOREAD(phy_info->xpcs, phy_reg); + } else { + ret = fw_reg_read(&priv->plat_ex->ctrl, 0x2c000000 + 0x800000 * phy_info->xpcs_idx + phy_reg, &val); + if (ret < 0) { + dev_err(priv->device, "(%s) read xpcs apb address=%08x val=%08x faile! err %d\n", + __func__, phy_reg, val, ret); + } + } + dev_dbg(priv->device, "(%s) xpcs apb address=%08x val=%08x\n", + __func__, phy_reg, val); + return val; +} + +static int dn200_xpcs_write(struct dn200_phy_info *phy_info, u32 addr, + u32 devad, u32 reg, u32 val) +{ + u32 phy_reg = 0; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + int ret = 0; + + if (devad == MDIO_DEVAD_NONE) { + // do nothing + } else { + phy_reg |= (devad & 0x1F) << 18; + phy_reg |= (reg & 0xFFFF) << 2; + dev_dbg(priv->device, + "(%s) write %08x to xpcs apb address %08x\n", __func__, + val, phy_reg); + if (phy_reg >= 0xc0000 && phy_reg <= 0xc00ff) { + phy_reg -= 0xc0000; + XPCS32_IOWRITE(phy_info->xpcs, phy_reg, val); + } else { + ret = fw_reg_write(&priv->plat_ex->ctrl, 0x2c000000 + 0x800000 * phy_info->xpcs_idx + phy_reg, val); + if (ret < 0) + dev_err(priv->device, "(%s) write %08x to xpcs apb address %08x failed! 
ret %d\n", __func__, + val, phy_reg, ret); + } + } + return 0; +} + +static int dn200_xpcs_set_bit(struct dn200_phy_info *phy_info, u32 addr, + u32 devad, u32 reg, u32 mask, u32 shift, u32 val) +{ + u16 reg_val = 0; + + if (devad == MDIO_DEVAD_NONE) { + // do nothing + } else { + reg_val = dn200_xpcs_read(phy_info, addr, devad, reg); + reg_val &= ~(mask); + reg_val |= (val << shift); + dn200_xpcs_write(phy_info, addr, devad, reg, reg_val); + } + return 0; +} + +static int dn200_phy_read(struct dn200_phy_info *phy_info, int devad, int reg) +{ + if (phy_info->phydev) + return __phy_read(phy_info->phydev, reg); + else if (phy_info->xpcs) + return phy_info->xpcs->xpcs_read(phy_info, phy_info->phy_addr, + devad, reg); + return 0; +} + +static int __maybe_unused dn200_phy_write(struct dn200_phy_info *phy_info, + int devad, int reg, u32 val) +{ + if (phy_info->phydev) + return __phy_write(phy_info->phydev, reg, val); + else if (phy_info->xpcs) + return phy_info->xpcs->xpcs_write(phy_info, phy_info->phy_addr, + devad, reg, val); + return 0; +} + +static u16 dn200_xpcs_phy_reg_read(struct dn200_phy_info *phy_info, u8 dev, + u32 reg) +{ + u32 reg_result = 0; + + reg_result |= (((dev & 0xff) << 24) | ((reg & 0xff) << 16)); + reg_result |= (dn200_phy_read(phy_info, dev, reg) & 0xffff); + return reg_result; +} + +static int dn200_extern_phy_identity(struct dn200_phy_info *phy_info) +{ + u32 id = 0; + int ret = 0; + + /* First, search C73 PCS using PCS MMD */ + ret = dn200_phy_read(phy_info, MDIO_MMD_PCS, MII_PHYSID1); + if (ret < 0) + return 0xffffffff; + + id = (u32) ret << 16; + + ret = dn200_phy_read(phy_info, MDIO_MMD_PCS, MII_PHYSID2); + if (ret < 0) + return 0xffffffff; + id |= (u32) ret; + + return id; +} + +static void dn200_phy_sfp_present(struct dn200_phy_info *phy_info); +static int dn200_i2c_xfer(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 dev_addr, u8 command, u8 *data) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + int retry = 0; + int ret 
= 0; + + while (retry < 5) { + ret = dn200_fw_i2c_rw_commit(&priv->plat_ex->ctrl, dev_addr, data, byte_offset, command); + if (ret <= 0) + return ret; + if (ret == 1) { /*read or write timeout*/ + dn200_phy_sfp_present(phy_info); + if (phy_info->sfp_mod_absent) { + netdev_warn(phy_info->dev, "sfp not present\n"); + return ret; + } + } + retry++; + } + netdev_err(phy_info->dev, "i2c command fail ret = %d\n", ret); + return ret; +} +static int dn200_generic_i2c_byte_read(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 dev_addr, u8 *data) +{ + int ret = 0; + + ret = dn200_i2c_xfer(phy_info, byte_offset, dev_addr, IC_READ, data); + return ret; +} + +static int dn200_generic_i2c_byte_write(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 dev_addr, u8 data) +{ + int ret = 0; + + ret = dn200_i2c_xfer(phy_info, byte_offset, dev_addr, IC_WRITE, &data); + return ret; +} + +static int dn200_i2c_read_eeprom(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 *eeprom_data) +{ + return phy_info->phy_ops->read_i2c_byte(phy_info, byte_offset, + DN200_I2C_EEPROM_DEV_ADDR, + eeprom_data); +} + +static int dn200_i2c_read_sff8472(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 *eeprom_data) +{ + return phy_info->phy_ops->read_i2c_byte(phy_info, byte_offset, + DN200_I2C_SFF8472_DEV_ADDR, + eeprom_data); +} + +static int dn200_i2c_write_eeprom(struct dn200_phy_info *phy_info, + u8 byte_offset, u8 eeprom_data) +{ + return phy_info->phy_ops->write_i2c_byte(phy_info, byte_offset, + DN200_I2C_EEPROM_DEV_ADDR, + eeprom_data); +} + +static void dn200_gpio_iowrite(struct dn200_phy_info *phy_info, int offset_val, + u8 val) +{ + u32 value = 0; + u8 offset_pin = 0; + u16 offset = offset_val & 0xffff; + + mutex_lock(&dn200_gpio_mutex); + if ((offset_val >> 16) & 0x1) + val = (~val) & 0x1; + /*may be offset is greater than 31 */ + offset_pin = (offset >> 5) << 2; + offset = offset & 31; + value = + ioread32(phy_info->gpio_base + offset_pin + + 
phy_info->gpio_data->reg_off_set_write); + value &= ~BIT(offset); + value |= val << offset; + iowrite32(value, + phy_info->gpio_base + offset_pin + + phy_info->gpio_data->reg_off_set_write); + + mutex_unlock(&dn200_gpio_mutex); +} + +static u8 dn200_gpio_ioread(struct dn200_phy_info *phy_info, int offset_val) +{ + u32 value = 0; + u8 offset_pin = 0; + u8 val = 0; + u16 offset = offset_val & 0xffff; + + mutex_lock(&dn200_gpio_mutex); + + /*may be offset is greater than 31 */ + offset_pin = (offset >> 5) << 2; + offset = offset & 31; + value = + ioread32(phy_info->gpio_base + offset_pin + + phy_info->gpio_data->reg_off_set_read); + val = (value >> (offset)) & 0x1; + if ((offset_val >> 16) & 0x1) + val = (~val) & 0x1; + mutex_unlock(&dn200_gpio_mutex); + + return val; +} + +static void dn200_phy_sfp_present(struct dn200_phy_info *phy_info) +{ + if (phy_info->sfp_has_gpio) { + phy_info->sfp_mod_absent = + dn200_gpio_ioread(phy_info, + phy_info->gpio_data->sfp_detect_pin); + } +} + +static void dn200_phy_sfp_rx_los(struct dn200_phy_info *phy_info) +{ + if (phy_info->sfp_has_gpio) { + phy_info->sfp_rx_los = + dn200_gpio_ioread(phy_info, + phy_info->gpio_data->sfp_rx_los_pin); + } +} + +static void dn200_phy_sfp_tx_falut(struct dn200_phy_info *phy_info) +{ + if (phy_info->sfp_has_gpio) { + phy_info->sfp_tx_falut = + dn200_gpio_ioread(phy_info, + phy_info->gpio_data->sfp_tx_fault_pin); + } +} + +static void dn200_phy_set_sfp_tx_disable(struct dn200_phy_info *phy_info) +{ + if (phy_info->sfp_has_gpio) { + dn200_gpio_iowrite(phy_info, + phy_info->gpio_data->sfp_tx_disable_pin, + phy_info->sfp_tx_disable); + } +} + +static void dn200_phy_set_rs_mode(struct dn200_phy_info *phy_info, bool high) +{ + if (high) { + dn200_gpio_iowrite(phy_info, phy_info->gpio_data->sfp_rs0_pin, + 1); + dn200_gpio_iowrite(phy_info, phy_info->gpio_data->sfp_rs1_pin, + 1); + } else { + dn200_gpio_iowrite(phy_info, phy_info->gpio_data->sfp_rs0_pin, + 0); + dn200_gpio_iowrite(phy_info, 
phy_info->gpio_data->sfp_rs1_pin, + 0); + } +} + +static void dn200_phy_set_led(struct dn200_phy_info *phy_info, bool on) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (phy_info->phydev) { + if (!on) { + extern_phy_force_led(phy_info->phydev, priv, 0, 1); + extern_phy_force_led(phy_info->phydev, priv, 1, 1); + return; + } + if (phy_info->speed == SPEED_1000) + extern_phy_force_led(phy_info->phydev, priv, 1, 0); + else if (phy_info->speed == SPEED_100) + extern_phy_force_led(phy_info->phydev, priv, 2, 0); + else + extern_phy_force_led(phy_info->phydev, priv, 1, 1); + + extern_phy_force_led(phy_info->phydev, priv, 0, 0); + return; + } + if (on) { + if (phy_info->speed == SPEED_10000) { + dn200_gpio_iowrite(phy_info, + phy_info->gpio_data->sfp_led1_pin, + 1); + dn200_gpio_iowrite(phy_info, + phy_info->gpio_data->sfp_led2_pin, + 0); + } else if (phy_info->speed == SPEED_1000) { + dn200_gpio_iowrite(phy_info, + phy_info->gpio_data->sfp_led2_pin, + 1); + dn200_gpio_iowrite(phy_info, + phy_info->gpio_data->sfp_led1_pin, + 0); + } + } else { + dn200_gpio_iowrite(phy_info, phy_info->gpio_data->sfp_led2_pin, + 1); + dn200_gpio_iowrite(phy_info, phy_info->gpio_data->sfp_led1_pin, + 1); + } +} + +static void dn200_blink_control(struct dn200_phy_info *phy_info, + bool link_status) +{ + u64 tmp_rx = 0; + u64 tmp_tx = 0; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + dwxgmac_read_mmc_reg(priv->mmcaddr, MMC_XGMAC_RX_PKT_GB, &tmp_rx); + dwxgmac_read_mmc_reg(priv->mmcaddr, MMC_XGMAC_TX_PKT_GB, &tmp_tx); + priv->mmc.mmc_rx_framecount_gb += tmp_rx; + priv->mmc.mmc_tx_framecount_gb += tmp_tx; + if ((tmp_rx || tmp_tx) && link_status) { + if (!priv->blink_state_last) { + if (phy_info->phydev) + extern_phy_force_led(phy_info->phydev, priv, 0, 2); + else + dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_ENABLE); + priv->blink_state_last = BLINK_ENABLE; + } + } else { + if (priv->blink_state_last) { + if (phy_info->phydev) + 
extern_phy_force_led(phy_info->phydev, priv, 0, !phy_info->phydev->link); + else + dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_DISABLE); + priv->blink_state_last = BLINK_DISABLE; + } + } +} + +static int dn200_sfp_module_identify(struct dn200_phy_info *phy_info) +{ + s32 err = DN200_ERR_PHY_ADDR_INVALID; + u32 vendor_oui = 0; + u8 identifier = 0; + u8 oui_bytes[3] = { 0, 0, 0 }; + + dn200_phy_sfp_present(phy_info); + if (phy_info->sfp_mod_absent) + return DN200_ERR_SFP_NOT_PRESENT; + + if (phy_info->media_type != DN200_MEDIA_TYPE_XPCS_1000BASEX && + phy_info->media_type != DN200_MEDIA_TYPE_XPCS_10GBASEKR) { + phy_info->sfp_type = (u8) DN200_SFP_TYPE_NOT_PRESENT; + return DN200_ERR_SFP_NOT_PRESENT; + } + err = phy_info->phy_ops->read_i2c_eeprom(phy_info, DN200_SFF_IDENTIFIER, + &identifier); + if (err != 0) { +ERR_I2C: + dn200_phy_sfp_present(phy_info); + if (!phy_info->sfp_mod_absent) + netdev_warn(phy_info->dev, "sfp not present\n"); + phy_info->sfp_type = (u8) DN200_SFP_TYPE_NOT_PRESENT; + return DN200_ERR_SFP_NOT_PRESENT; + } + + if (identifier != DN200_SFF_IDENTIFIER_SFP) { + netdev_warn(phy_info->dev, "sfp not support, identifier %d\n", identifier); + phy_info->sfp_type = (u8) DN200_SFP_TYPE_UNKNOWN; + return DN200_ERR_SFP_NOT_SUPPORTED; + } + + /* Determine SFF module vendor */ + err = phy_info->phy_ops->read_i2c_eeprom(phy_info, + DN200_SFF_VENDOR_OUI_BYTE0, + &oui_bytes[0]); + if (err != 0) + goto ERR_I2C; + + err = phy_info->phy_ops->read_i2c_eeprom(phy_info, + DN200_SFF_VENDOR_OUI_BYTE1, + &oui_bytes[1]); + if (err != 0) + goto ERR_I2C; + + err = phy_info->phy_ops->read_i2c_eeprom(phy_info, + DN200_SFF_VENDOR_OUI_BYTE2, + &oui_bytes[2]); + if (err != 0) + goto ERR_I2C; + + vendor_oui = ((u32)oui_bytes[0] << 24) | + ((u32)oui_bytes[1] << 16) | ((u32)oui_bytes[2] << 8); + switch (vendor_oui) { + case DN200_SFF_VENDOR_OUI_AVAGO: + phy_info->sfp_module_type = DN200_PHY_SFP_MODULE_AVAGO; + break; + default: + phy_info->sfp_module_type = 
DN200_PHY_SFP_MODULE_UNKNOWN; + break; + } + phy_info->sfp_id = DN200_SFF_IDENTIFIER_SFP; + return err; +} + +static void dn200_phy_sfp_reset(struct dn200_phy_info *phy_info) +{ + phy_info->sfp_rx_los = 0; + phy_info->sfp_tx_disable = 0; + dn200_phy_sfp_present(phy_info); + phy_info->sfp_base = DN200_SFP_BASE_UNKNOWN; + phy_info->sfp_cable = DN200_SFP_CABLE_UNKNOWN; + phy_info->sfp_speed = DN200_SFP_SPEED_UNKNOWN; + phy_info->multispeed_sfp = false; +} + +static bool dn200_phy_sfp_bit_rate(struct dn200_phy_info *phy_info, + u32 sfp_speed) +{ + u8 min; + u8 sfp_base_br; + + phy_info->phy_ops->read_i2c_eeprom(phy_info, DN200_SFP_BASE_BR, + &sfp_base_br); + + switch (sfp_speed) { + case SPEED_1000: + min = DN200_SFP_BASE_BR_1GBE_MIN; + break; + case SPEED_10000: + min = DN200_SFP_BASE_BR_10GBE_MIN; + break; + default: + return false; + } + + return sfp_base_br >= min; +} + +static void dn200_clear_lks(struct dn200_phy_info *phy_info, int type, bool is_sup); +static void dn200_link_status_reset(struct dn200_phy_info *phy_info, bool enable); +static void dn200_xpcs_switch_to_10G(struct dn200_phy_info *phy_info); +static int dn200_phy_sfp_detect(struct dn200_phy_info *phy_info) +{ + u8 sff_cable, sff_10g_comp, sff_1g_comp; + struct ethtool_link_ksettings *lks = &phy_info->lks; + u32 sfp_speed = phy_info->sfp_speed; + + /* Reset the SFP signals and info */ + dn200_phy_sfp_reset(phy_info); + /* Read the SFP signals and check for module presence */ + dn200_phy_sfp_present(phy_info); + if (phy_info->sfp_mod_absent) + goto put; + + dn200_clear_lks(phy_info, DN200_SFP_TYPE_1000 | DN200_SFP_TYPE_10000, true); + phy_info->link_modes = 0; + phy_info->phy_ops->read_i2c_eeprom(phy_info, DN200_SFF_CABLE_TECHNOLOGY, + &sff_cable); + /* Assume FIBRE cable unless told otherwise */ + if (sff_cable & DN200_SFP_BASE_CABLE_PASSIVE) + phy_info->sfp_cable = DN200_SFP_CABLE_PASSIVE; + else if (sff_cable & DN200_SFP_BASE_CABLE_ACTIVE) + phy_info->sfp_cable = DN200_SFP_CABLE_ACTIVE; + else + 
phy_info->sfp_cable = DN200_SFP_CABLE_FIBRE; + + /* Determine the type of SFP */ + phy_info->phy_ops->read_i2c_eeprom(phy_info, DN200_SFF_10GBE_COMP_CODES, + &sff_10g_comp); + phy_info->phy_ops->read_i2c_eeprom(phy_info, DN200_SFF_1GBE_COMP_CODES, + &sff_1g_comp); + + /*10G mode */ + if (phy_info->sfp_cable != DN200_SFP_CABLE_FIBRE && + dn200_phy_sfp_bit_rate(phy_info, SPEED_10000)) + phy_info->sfp_base |= DN200_SFP_BASE_10000_CR; + else if (sff_10g_comp & DN200_SFP_BASE_10GBE_CC_SR) + phy_info->sfp_base |= DN200_SFP_BASE_10000_SR; + else if (sff_10g_comp & DN200_SFP_BASE_10GBE_CC_LR) + phy_info->sfp_base |= DN200_SFP_BASE_10000_LR; + else if (sff_10g_comp & DN200_SFP_BASE_10GBE_CC_LRM) + phy_info->sfp_base |= DN200_SFP_BASE_10000_LRM; + else if (sff_10g_comp & DN200_SFP_BASE_10GBE_CC_ER) + phy_info->sfp_base |= DN200_SFP_BASE_10000_ER; + + /*check sfp module 1G mode */ + if (sff_1g_comp & DN200_SFP_BASE_1GBE_CC_SX) + phy_info->sfp_base |= DN200_SFP_BASE_1000_SX; + else if (sff_1g_comp & DN200_SFP_BASE_1GBE_CC_LX) + phy_info->sfp_base |= DN200_SFP_BASE_1000_LX; + else if (sff_1g_comp & DN200_SFP_BASE_1GBE_CC_CX) + phy_info->sfp_base |= DN200_SFP_BASE_1000_CX; + else if (sff_1g_comp & DN200_SFP_BASE_1GBE_CC_T) + phy_info->sfp_base |= DN200_SFP_BASE_1000_T; + + switch (phy_info->sfp_base) { + case DN200_SFP_BASE_1000_T: + DN200_SET_SUP(lks, 1000baseT_Full); + DN200_SET_SUP(lks, 100baseT_Full); + phy_info->sfp_speed = + (DN200_SFP_SPEED_1000 | DN200_SFP_SPEED_100); + phy_info->link_modes |= DN300_100BASET_Full | DN300_1000BASET_Full; + break; + case DN200_SFP_BASE_1000_SX: + DN200_SET_SUP(lks, 1000baseX_Full); + phy_info->link_modes |= DN300_1000BASET_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_1000; + break; + case DN200_SFP_BASE_1000_LX: + case DN200_SFP_BASE_1000_CX: + DN200_SET_SUP(lks, 1000baseX_Full); + phy_info->link_modes |= DN300_1000BASET_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_1000; + break; + case DN200_SFP_BASE_10000_SR: + DN200_SET_SUP(lks, 
10000baseSR_Full); + phy_info->link_modes |= DN300_10000baseSR_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_10000; + break; + case DN200_SFP_BASE_10000_LR: + DN200_SET_SUP(lks, 10000baseLR_Full); + phy_info->link_modes |= DN300_10000baseLR_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_10000; + break; + case DN200_SFP_BASE_10000_LRM: + DN200_SET_SUP(lks, 10000baseLRM_Full); + phy_info->link_modes |= DN300_10000baseLRM_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_10000; + break; + case DN200_SFP_BASE_10000_ER: + DN200_SET_SUP(lks, 10000baseER_Full); + phy_info->link_modes |= DN300_10000baseER_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_10000; + break; + case DN200_SFP_BASE_10000_CR: + DN200_SET_SUP(lks, 10000baseCR_Full); + phy_info->link_modes |= DN300_10000baseCR_Full; + phy_info->sfp_speed |= DN200_SFP_SPEED_10000; + break; + case DN200_SFP_BASE_10000_SR | DN200_SFP_BASE_1000_SX: + phy_info->link_modes |= DN300_10000baseSR_Full | DN300_1000BASEX_Full; + DN200_SET_SUP(lks, 10000baseSR_Full); + DN200_SET_SUP(lks, 1000baseX_Full); + phy_info->sfp_speed = + DN200_SFP_SPEED_10000 | DN200_SFP_SPEED_1000; + break; + case DN200_SFP_BASE_10000_LR | DN200_SFP_BASE_1000_LX: + phy_info->link_modes |= DN300_10000baseLR_Full | DN300_1000BASEX_Full; + DN200_SET_SUP(lks, 10000baseLR_Full); + DN200_SET_SUP(lks, 1000baseX_Full); + phy_info->sfp_speed = + DN200_SFP_SPEED_10000 | DN200_SFP_SPEED_1000; + break; + case DN200_SFP_BASE_10000_CR | DN200_SFP_BASE_1000_CX: + phy_info->link_modes |= DN300_10000baseCR_Full | DN300_1000BASEX_Full; + DN200_SET_SUP(lks, 10000baseCR_Full); + DN200_SET_SUP(lks, 1000baseX_Full); + phy_info->sfp_speed = + DN200_SFP_SPEED_10000 | DN200_SFP_SPEED_1000; + break; + default: + break; + } + + if ((phy_info->sfp_speed & DN200_SFP_SPEED_10000) + && (phy_info->sfp_speed & DN200_SFP_SPEED_1000)) { + phy_info->multispeed_sfp = true; + } + + if (sfp_speed != phy_info->sfp_speed && sfp_speed != DN200_SFP_SPEED_UNKNOWN) + phy_info->sfp_changed = true; + 
bitmap_copy(lks->link_modes.advertising, + lks->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (phy_info->speed == SPEED_1000) { + dn200_clear_lks(phy_info, DN200_SFP_TYPE_10000, false); + phy_info->link_modes = phy_info->link_modes & 0x7; + } + set_bit(DN200_PHY_SFP_INITED, &phy_info->phy_state); + clear_bit(DN200_PHY_SFP_NEED_RESET, &phy_info->phy_state); +put: + return 0; +} + +static int dn200_xpcs_link_down_reg_dump(struct dn200_phy_info *phy_info) +{ + u8 max_len = DN200_MAX_PHY_DUMP_NUM; + int i = 0; + u16 reg_info; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (!phy_info->xpcs) + return 0; + + for (; i < max_len; i++) { + reg_info = + dn200_xpcs_phy_reg_read(phy_info, + xpcs_link_down_dump_regs[i].dev, + xpcs_link_down_dump_regs[i].reg); + dev_info(priv->device, + "%s dev (%#06x) reg (%#06x) offset (%#06x) value (%#06x)\n", + xpcs_link_down_dump_regs[i].reg_str, + xpcs_link_down_dump_regs[i].dev, + xpcs_link_down_dump_regs[i].reg, + (xpcs_link_down_dump_regs[i].dev << 18) | (xpcs_link_down_dump_regs[i].reg << 2), + reg_info); + } + + return 0; +} + +int dn200_xpcs_config_eee(struct dn200_phy_info *phy_info, int mult_fact_100ns, + int enable) +{ + int ret; + + ret = dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, VR_MII_EEE_MCTRL0); + if (ret < 0) + return ret; + + if (enable) { + /* Enable EEE */ + ret = VR_MII_EEE_LTX_EN | VR_MII_EEE_LRX_EN | + VR_MII_EEE_TX_QUIET_EN | VR_MII_EEE_RX_QUIET_EN | + VR_MII_EEE_TX_EN_CTRL | VR_MII_EEE_RX_EN_CTRL | + mult_fact_100ns << VR_MII_EEE_MULT_FACT_100NS_SHIFT; + } else { + ret &= ~(VR_MII_EEE_LTX_EN | VR_MII_EEE_LRX_EN | + VR_MII_EEE_TX_QUIET_EN | VR_MII_EEE_RX_QUIET_EN | + VR_MII_EEE_TX_EN_CTRL | VR_MII_EEE_RX_EN_CTRL | + VR_MII_EEE_MULT_FACT_100NS); + } + + ret = + dn200_xpcs_write(phy_info, 0, MDIO_MMD_PCS, VR_MII_EEE_MCTRL0, ret); + if (ret < 0) + return ret; + + ret = dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, VR_MII_EEE_MCTRL1); + if (ret < 0) + return ret; + + if (enable) + ret |= 
VR_MII_EEE_TRN_LPI; + else + ret &= ~VR_MII_EEE_TRN_LPI; + + return dn200_xpcs_write(phy_info, 0, MDIO_MMD_PCS, VR_MII_EEE_MCTRL1, + ret); +} + +static int dn200_ethtool_get_eee(struct dn200_phy_info *phy_info, + struct ethtool_eee *data) +{ + int val; + + if (phy_info->phydev) { + return phy_ethtool_get_eee(phy_info->phydev, data); + } else if (phy_info->xpcs) { + /* Get Supported EEE */ + val = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, + MDIO_PCS_EEE_ABLE); + if (val < 0) + return val; + data->supported = mmd_eee_cap_to_ethtool_sup_t(val); + + /* Get advertisement EEE */ + val = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_AN, MDIO_AN_EEE_ADV); + if (val < 0) + return val; + data->advertised = mmd_eee_adv_to_ethtool_adv_t(val); + data->eee_enabled = !!data->advertised; + + /* Get LP advertisement EEE */ + val = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_AN, + MDIO_AN_EEE_LPABLE); + if (val < 0) + return val; + data->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(val); + + data->eee_active = !!(data->advertised & data->lp_advertised); + + return 0; + } + return 0; +} +static int dn200_xpcs_an_config(struct dn200_phy_info *phy_info); +static int dn200_ethtool_set_eee(struct dn200_phy_info *phy_info, + struct ethtool_eee *data) +{ + int cap, old_adv, adv = 0, ret; + + if (phy_info->phydev) { + return phy_ethtool_set_eee(phy_info->phydev, data); + } else if (phy_info->xpcs) { + /* Get Supported EEE */ + cap = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, + MDIO_PCS_EEE_ABLE); + if (cap < 0) + return cap; + + old_adv = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_AN, MDIO_AN_EEE_ADV); + if (old_adv < 0) + return old_adv; + + if (data->eee_enabled) { + adv = !data->advertised ? 
cap : + ethtool_adv_to_mmd_eee_adv_t(data->advertised) & cap; + /* Mask prohibited EEE modes */ + adv &= ~phy_info->eee_broken_modes; + } + if (old_adv != adv) { + ret = + dn200_xpcs_write(phy_info, 0, MDIO_MMD_AN, + MDIO_AN_EEE_ADV, adv); + if (ret < 0) + return ret; + + /* Restart autonegotiation so the new modes get sent to the + * link partner. + */ + if (phy_info->an == DN200_AN_ENABLE) { + ret = dn200_xpcs_an_config(phy_info); + if (ret < 0) + return ret; + } + } + + return 0; + } + return 0; +} +static void mmd_eee_adv_to_linkmode(unsigned long *advertising, u16 eee_adv) +{ + linkmode_zero(advertising); + + if (eee_adv & MDIO_EEE_100TX) + linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, + advertising); + if (eee_adv & MDIO_EEE_1000T) + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + advertising); + if (eee_adv & MDIO_EEE_10GT) + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, + advertising); + if (eee_adv & MDIO_EEE_1000KX) + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + advertising); + if (eee_adv & MDIO_EEE_10GKX4) + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + advertising); + if (eee_adv & MDIO_EEE_10GKR) + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + advertising); +} + +bool phy_check_valid(int speed, int duplex, unsigned long *features) +{ + return !!phy_lookup_setting(speed, duplex, features, true); +} + +static int dn200_phy_init_eee(struct dn200_phy_info *phy_info, + bool clk_stop_enable) +{ + if (phy_info->phydev) { + return phy_init_eee(phy_info->phydev, clk_stop_enable); + } else if (phy_info->xpcs) { + /* According to 802.3az,the EEE is supported only in full duplex-mode. 
+ */ + if (phy_info->dup == DN200_DUP_FULL) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(common); + __ETHTOOL_DECLARE_LINK_MODE_MASK(lp); + __ETHTOOL_DECLARE_LINK_MODE_MASK(adv); + int eee_lp, eee_cap, eee_adv; + int status; + u32 cap; + + /* Read phy status to properly get the right settings */ + //status = phy_read_status(phydev); + //if (status) + // return status; + + /* First check if the EEE ability is supported */ + eee_cap = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, + MDIO_PCS_EEE_ABLE); + if (eee_cap <= 0) + goto eee_exit_err; + + cap = mmd_eee_cap_to_ethtool_sup_t(eee_cap); + if (!cap) + goto eee_exit_err; + + /* Check which link settings negotiated and verify it in + * the EEE advertising registers. + */ + eee_lp = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_AN, + MDIO_AN_EEE_LPABLE); + if (eee_lp <= 0) + goto eee_exit_err; + + eee_adv = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_AN, + MDIO_AN_EEE_ADV); + if (eee_adv <= 0) + goto eee_exit_err; + + mmd_eee_adv_to_linkmode(adv, eee_adv); + mmd_eee_adv_to_linkmode(lp, eee_lp); + dn200_linkmode_and(common, adv, lp); + if (!phy_check_valid + (phy_info->speed, phy_info->dup, common)) + goto eee_exit_err; + if (clk_stop_enable) { + /* Configure the PHY to stop receiving xMII + * clock while it is signaling LPI. 
+ */ + status = + dn200_xpcs_read(phy_info, 0, MDIO_MMD_PCS, + MDIO_CTRL1); + dn200_xpcs_write(phy_info, 0, MDIO_MMD_PCS, + MDIO_CTRL1, + status | + MDIO_PCS_CTRL1_CLKSTOP_EN); + } + return 0; /* EEE supported */ + } + } +eee_exit_err: + return -EPROTONOSUPPORT; +} + +static int dn200_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + struct ethtool_link_ksettings *lks = &PRIV_PHY_INFO(priv)->lks; + + DN200_LM_COPY(cmd, supported, lks, supported); + DN200_LM_COPY(cmd, advertising, lks, advertising); + if (phy_info->an && phy_info->an_sucess) + DN200_LM_COPY(cmd, lp_advertising, lks, lp_advertising); + if (phy_info->link_status) { + cmd->base.speed = phy_info->speed; + cmd->base.duplex = phy_info->dup; + } else { + /* With no link speed and duplex are unknown */ + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + } + if (phy_info->phydev) { + extern_phy_mdix_status_get(phy_info->phydev, + &cmd->base.eth_tp_mdix, &cmd->base.eth_tp_mdix_ctrl); + } + cmd->base.phy_address = phy_info->phy_addr; + cmd->base.autoneg = phy_info->an; + cmd->base.port = phy_info->port_type; + return 0; +} + +static void dn200_set_pcie_conf(struct dn200_phy_info *phy_info, u32 offset, + u32 val) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + dev_dbg(priv->device, "offset %#x val %#x\n", offset, val); + if (priv->speed_cmd) + fw_reg_write(&priv->plat_ex->ctrl, 0x24300000 + offset, val); + else + writel(val, phy_info->xpcs->xpcs_regs_base + + DN200_PCIE_BAROFF + offset); +} + +static void dn200_get_pcie_conf(struct dn200_phy_info *phy_info, u32 offset, + u32 *val) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (priv->speed_cmd) + fw_reg_read(&priv->plat_ex->ctrl, 0x24300000 + offset, val); + else + *val = readl(phy_info->xpcs->xpcs_regs_base + + DN200_PCIE_BAROFF + offset); +} + +static void 
dn200_conf_pcie_common_para(struct dn200_phy_info *phy_info) +{ + u32 reg_val; + + dn200_set_pcie_conf(phy_info, 0x1c90, 0x1); + dn200_set_pcie_conf(phy_info, 0x1c60, 0x96); + dn200_set_pcie_conf(phy_info, 0x1c80, 0x1); + dn200_set_pcie_conf(phy_info, 0x1c88, 0x2); + dn200_get_pcie_conf(phy_info, 0x24e8, ®_val); + reg_val |= BIT(8); + reg_val &= ~(GENMASK(14, 10)); + reg_val |= (0x7 << 10); + dn200_set_pcie_conf(phy_info, 0x24e8, reg_val); + dn200_set_pcie_conf(phy_info, 0x1c20, 0x4b); + dn200_set_pcie_conf(phy_info, 0x1c18, 0x63f); + dn200_set_pcie_conf(phy_info, 0x1c10, 0x63f); + dn200_set_pcie_conf(phy_info, 0x181c, 0x3); + dn200_set_pcie_conf(phy_info, 0x2180, 0x5); + dn200_get_pcie_conf(phy_info, 0x1b90, ®_val); + reg_val |= BIT(6) | BIT(7); + dn200_set_pcie_conf(phy_info, 0x1b90, reg_val); +} + +static void dn200_conf_pcie_10G_tx_para(struct dn200_phy_info *phy_info) +{ + dn200_set_pcie_conf(phy_info, 0x21c0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x0); + dn200_set_pcie_conf(phy_info, 0x2190 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x8); + dn200_set_pcie_conf(phy_info, 0x2198 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x8); + dn200_set_pcie_conf(phy_info, 0x21e8 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4f); + dn200_set_pcie_conf(phy_info, 0x21f0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4f); + dn200_set_pcie_conf(phy_info, 0x2220 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4); + dn200_set_pcie_conf(phy_info, 0x2238 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x3); + dn200_set_pcie_conf(phy_info, 0x2240 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x2); + dn200_set_pcie_conf(phy_info, 0x2218 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x1); + dn200_set_pcie_conf(phy_info, 0x21d0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x1); + dn200_set_pcie_conf(phy_info, 0x21f8 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x1); + dn200_set_pcie_conf(phy_info, 0x2108 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x6); + dn200_set_pcie_conf(phy_info, 0x2228 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x5); + dn200_set_pcie_conf(phy_info, 0x2208 + 
(3 - phy_info->xpcs_idx) * 0xd0, + 0xb); + dn200_set_pcie_conf(phy_info, 0x2210 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4); + dn200_set_pcie_conf(phy_info, 0x2248 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x2); +} + +static void dn200_conf_pcie_10G_rx_para(struct dn200_phy_info *phy_info) +{ + dn200_set_pcie_conf(phy_info, 0x1d58 + (3 - phy_info->xpcs_idx) * 0x110, + 0x13); + + + /*eq_ctle_boost */ + dn200_set_pcie_conf(phy_info, 0x1dc0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x10); + /*eq_ctle_pole */ + dn200_set_pcie_conf(phy_info, 0x1dc8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + /*eq_afe_rate */ + dn200_set_pcie_conf(phy_info, 0x1db0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x6); + /*eq_vga_gain */ + dn200_set_pcie_conf(phy_info, 0x1de8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x10); + /*eq_afe_config */ + dn200_set_pcie_conf(phy_info, 0x1da8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x514); + dn200_set_pcie_conf(phy_info, 0x1dd8 + (3 - phy_info->xpcs_idx) * 0x110, + 0xc); + dn200_set_pcie_conf(phy_info, 0x1de0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x80); + dn200_set_pcie_conf(phy_info, 0x1d78 + (3 - phy_info->xpcs_idx) * 0x110, + 0x6); + dn200_set_pcie_conf(phy_info, 0x1d60 + (3 - phy_info->xpcs_idx) * 0x110, + 0x403); + dn200_set_pcie_conf(phy_info, 0x1d70 + (3 - phy_info->xpcs_idx) * 0x110, + 0xb); + dn200_set_pcie_conf(phy_info, 0x1d68 + (3 - phy_info->xpcs_idx) * 0x110, + 0xb); + dn200_set_pcie_conf(phy_info, 0x1e20 + (3 - phy_info->xpcs_idx) * 0x110, + 0x4); + dn200_set_pcie_conf(phy_info, 0x1e18 + (3 - phy_info->xpcs_idx) * 0x110, + 0x2); + dn200_set_pcie_conf(phy_info, 0x1df0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0); + dn200_set_pcie_conf(phy_info, 0x1e30 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + dn200_set_pcie_conf(phy_info, 0x1e38 + (3 - phy_info->xpcs_idx) * 0x110, + 0x2); + dn200_set_pcie_conf(phy_info, 0x1e00 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + dn200_set_pcie_conf(phy_info, 0x1df8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + 
dn200_set_pcie_conf(phy_info, 0x1d40 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + dn200_set_pcie_conf(phy_info, 0x1d90 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1); + dn200_set_pcie_conf(phy_info, 0x1d48 + (3 - phy_info->xpcs_idx) * 0x110, + 0x4); + /*rx_dfe_bypass */ + dn200_set_pcie_conf(phy_info, 0x1d80 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0); + /* rx_vco_ld_val=1650&&rx_ref_ld_val=20=>rx_clk */ + dn200_set_pcie_conf(phy_info, 0x1e08 + (3 - phy_info->xpcs_idx) * 0x110, + 0x14); + dn200_set_pcie_conf(phy_info, 0x1e40 + (3 - phy_info->xpcs_idx) * 0x110, + 0x672); +} + +static void dn200_conf_pcie_1G_tx_para(struct dn200_phy_info *phy_info) +{ + writel(0x81, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_ENABLE_CTRL(phy_info->xpcs_idx)); + /*tx_misc */ + dn200_set_pcie_conf(phy_info, 0x21c0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x80);/*tx*/ + + dn200_set_pcie_conf(phy_info, 0x2190 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x8);/*tx*/ + dn200_set_pcie_conf(phy_info, 0x2198 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x8);/*tx*/ + /*cp_ctl_intg */ + dn200_set_pcie_conf(phy_info, 0x21e8 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x48);/*tx*/ + /*cp_ctl_prog */ + dn200_set_pcie_conf(phy_info, 0x21f0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x61);/*tx*/ + + dn200_set_pcie_conf(phy_info, 0x2220 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4);/*tx*/ + + dn200_set_pcie_conf(phy_info, 0x2238 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x3);/*tx*/ + /*vco_low_freq */ + dn200_set_pcie_conf(phy_info, 0x2240 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x3);/*tx*/ + /*postdiv */ + dn200_set_pcie_conf(phy_info, 0x2218 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x1);/*tx*/ + /*tx_rate */ + dn200_set_pcie_conf(phy_info, 0x21d0 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x4);/*tx*/ + /*div16P5_clk */ + dn200_set_pcie_conf(phy_info, 0x21f8 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x0);/*tx*/ + /*word_clk_freq */ + dn200_set_pcie_conf(phy_info, 0x2108 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x0);/*tx*/ + /*ropll_refdiv */ + 
dn200_set_pcie_conf(phy_info, 0x2228 + (3 - phy_info->xpcs_idx) * 0xd0, + 0xf);/*tx*/ + /*ropll_fbdiv */ + dn200_set_pcie_conf(phy_info, 0x2208 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x20);/*tx*/ + /*ropll_out_div */ + dn200_set_pcie_conf(phy_info, 0x2210 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x5);/*tx*/ + /*word_clk_div_sel */ + dn200_set_pcie_conf(phy_info, 0x2248 + (3 - phy_info->xpcs_idx) * 0xd0, + 0x3);/*tx*/ + writel(0x87, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_ENABLE_CTRL(phy_info->xpcs_idx)); + udelay(100); +} + +static void dn200_conf_pcie_1G_rx_para(struct dn200_phy_info *phy_info) +{ + writel(0x81, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_ENABLE_CTRL(phy_info->xpcs_idx)); + dn200_set_pcie_conf(phy_info, 0x1e08 + (3 - phy_info->xpcs_idx) * 0x110, + 0x11);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1e40 + (3 - phy_info->xpcs_idx) * 0x110, + 0x550);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1d58 + (3 - phy_info->xpcs_idx) * 0x110, + 0x13);/*rx*/ + + /*eq_ctle_boost */ + dn200_set_pcie_conf(phy_info, 0x1dc0 + (3 - phy_info->xpcs_idx) * 0x110, + 0xC);/*rx*/ + /*eq_ctle_pole */ + dn200_set_pcie_conf(phy_info, 0x1dc8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0);/*rx*/ + /*eq_afe_rate */ + dn200_set_pcie_conf(phy_info, 0x1db0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x7);/*rx*/ + /*eq_vga_gain */ + dn200_set_pcie_conf(phy_info, 0x1de8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x14);/*rx*/ + /*eq_afe_config */ + dn200_set_pcie_conf(phy_info, 0x1da8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x514);/*rx*/ + /*afe_tap1 */ + dn200_set_pcie_conf(phy_info, 0x1dd8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1de0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x80);/*rx*/ + /*delta_iq */ + dn200_set_pcie_conf(phy_info, 0x1d78 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1d60 + (3 - phy_info->xpcs_idx) * 0x110, + 0x403);/*rx*/ + + dn200_set_pcie_conf(phy_info, 0x1d70 + (3 - phy_info->xpcs_idx) 
* 0x110, + 0xb);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1d68 + (3 - phy_info->xpcs_idx) * 0x110, + 0xb);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1e20 + (3 - phy_info->xpcs_idx) * 0x110, + 0x4);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1e18 + (3 - phy_info->xpcs_idx) * 0x110, + 0x2);/*rx*/ + /*rx_misc */ + dn200_set_pcie_conf(phy_info, 0x1df0 + (3 - phy_info->xpcs_idx) * 0x110, + 0x80);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1e38 + (3 - phy_info->xpcs_idx) * 0x110, + 0x2);/*rx*/ + /*rx_rate */ + dn200_set_pcie_conf(phy_info, 0x1e00 + (3 - phy_info->xpcs_idx) * 0x110, + 0x4);/*rx*/ + /*rx_dfe_bypass */ + dn200_set_pcie_conf(phy_info, 0x1d80 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1df8 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1);/*rx*/ + dn200_set_pcie_conf(phy_info, 0x1d40 + (3 - phy_info->xpcs_idx) * 0x110, + 0x1);/*rx*/ + /*16P5_clk */ + dn200_set_pcie_conf(phy_info, 0x1d90 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0);/*rx*/ + /*adapt_mode */ + dn200_set_pcie_conf(phy_info, 0x1d48 + (3 - phy_info->xpcs_idx) * 0x110, + 0x0);/*rx*/ + writel(0x87, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_ENABLE_CTRL(phy_info->xpcs_idx)); + udelay(100); +} + + +static int _phy_reg_read(struct dn200_phy_info *phy_info, + u16 phy_reg_addr, u16 *reg_val); +int dn200_phy_clock_stable_judge(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0; + int clock_state = 0; + int ret = 0; + unsigned long out_time_start = 0; + unsigned long in_time_start = 0; + + out_time_start = jiffies; + while (true) { + _phy_reg_read(phy_info, 0x1069 + (3 - phy_info->xpcs_idx) * 0x200, ®_val); + in_time_start = jiffies; + if (reg_val & BIT(15)) { + while (true) { + if (time_after(jiffies, in_time_start + usecs_to_jiffies(2000))) { + clock_state = 1; + break; + } + _phy_reg_read(phy_info, 0x1069 + (3 - phy_info->xpcs_idx) * 0x200, ®_val); + if (!(reg_val & BIT(15))) { + clock_state = 0; + break; + } + usleep_range(10, 20); + } + } + + if (clock_state) + 
break; + if (time_after(jiffies, out_time_start + msecs_to_jiffies(2000))) { + clock_state = 0; + break; + } + usleep_range(10, 20); + } + + if (!clock_state) + ret = -1; + return ret; +} + +static int dn200_xpcs_switch_to_1G_tx_regset(struct dn200_phy_info *phy_info); +static int dn200_xpcs_switch_to_1G_rx_regset(struct dn200_phy_info *phy_info); +static int dn200_xpcs_switch_to_10G_rx_regset(struct dn200_phy_info *phy_info); +static int dn200_xpcs_switch_to_10G_tx_regset(struct dn200_phy_info *phy_info); +static int _phy_reg_write(struct dn200_phy_info *phy_info, u16 phy_reg_addr, + u16 reg_val); +static void dn200_xpcs_prepare_switch_speed(struct dn200_phy_info *phy_info) +{ + u32 reg_val; + + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL); + reg_val |= TX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL, reg_val); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1); + reg_val |= RX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1, reg_val); + usleep_range(20, 30); + _phy_reg_write(phy_info, 0x2036 + 0x200 * (3 - phy_info->xpcs_idx), 0x3); + usleep_range(20, 30); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL); + reg_val &= ~TX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL, reg_val); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1); + reg_val &= ~RX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1, reg_val); +} + +static void dn200_xpcs_switch_to_10G(struct dn200_phy_info *phy_info) +{ + u16 phy_val, phy_val1; + int try = 0; + + dn200_phy_set_rs_mode(phy_info, true); + dn200_xpcs_prepare_switch_speed(phy_info); + /*wait tx rest done*/ + while (try++ < 100000) { + _phy_reg_read(phy_info, 0x2034 + 0x200 * (3 - 
phy_info->xpcs_idx), &phy_val); + if ((phy_val & BIT(0))) { + netdev_dbg(phy_info->dev, "%s %d wait tx rest_done\n", __func__, __LINE__); + break; + } + usleep_range(10, 20); + }; + dn200_conf_pcie_10G_tx_para(phy_info); + dn200_xpcs_switch_to_10G_tx_regset(phy_info); + usleep_range(10, 20); + _phy_reg_write(phy_info, 0x2036 + 0x200 * (3 - phy_info->xpcs_idx), 0x2); + /*wait rx rest done*/ + while (try++ < 100000) { + _phy_reg_read(phy_info, 0x2035 + 0x200 * (3 - phy_info->xpcs_idx), &phy_val1); + if ((phy_val1 & BIT(0))) { + netdev_dbg(phy_info->dev, "%s %d wait rx rest_done\n", __func__, __LINE__); + break; + } + usleep_range(10, 20); + }; + dn200_conf_pcie_10G_rx_para(phy_info); + dn200_xpcs_switch_to_10G_rx_regset(phy_info); + _phy_reg_write(phy_info, 0x2036 + 0x200 * (3 - phy_info->xpcs_idx), 0x0); + usleep_range(10, 20); + dn200_link_status_reset(phy_info, true); +} + +static void dn200_xpcs_switch_to_1G(struct dn200_phy_info *phy_info) +{ + u16 phy_val, phy_val1; + int try = 0; + + dn200_phy_set_rs_mode(phy_info, false); + dn200_xpcs_prepare_switch_speed(phy_info); + /*wait tx rest done*/ + while (try++ < 100000) { + _phy_reg_read(phy_info, 0x2034 + 0x200 * (3 - phy_info->xpcs_idx), &phy_val); + if ((phy_val & BIT(0))) { + netdev_dbg(phy_info->dev, "%s %d wait tx rest_done\n", __func__, __LINE__); + break; + } + usleep_range(10, 20); + }; + dn200_conf_pcie_1G_tx_para(phy_info); + dn200_xpcs_switch_to_1G_tx_regset(phy_info); + usleep_range(10, 20); + _phy_reg_write(phy_info, 0x2036 + 0x200 * (3 - phy_info->xpcs_idx), 0x2); + /*wait rx rest done*/ + while (try++ < 100000) { + _phy_reg_read(phy_info, 0x2035 + 0x200 * (3 - phy_info->xpcs_idx), &phy_val1); + if ((phy_val1 & BIT(0))) { + netdev_dbg(phy_info->dev, "%s %d wait rx rest_done\n", __func__, __LINE__); + break; + } + usleep_range(10, 20); + }; + dn200_conf_pcie_1G_rx_para(phy_info); + dn200_xpcs_switch_to_1G_rx_regset(phy_info); + _phy_reg_write(phy_info, 0x2036 + 0x200 * (3 - phy_info->xpcs_idx), 0x0); 
+ usleep_range(10, 20); + dn200_link_status_reset(phy_info, true); +} + +static int dn200_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *cmd) +{ + struct dn200_priv *priv = netdev_priv(netdev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + struct ethtool_link_ksettings *lks = &PRIV_PHY_INFO(priv)->lks; + enum an_state curr_an = phy_info->an; + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); + int ret; + + if (priv->plat_ex->has_xpcs) { + if (cmd->base.speed != phy_info->speed && + cmd->base.speed <= phy_info->max_speed && + cmd->base.speed != phy_info->setting_speed) { + if (cmd->base.speed == SPEED_10000) { + phy_info->speed = cmd->base.speed; + netdev_info(netdev, "succeeded to switch to %d\n", + cmd->base.speed); + } else if (cmd->base.speed == SPEED_1000) { + /*switch mac's speed and then switch xpcs's speed */ + netdev_info(netdev, "succeeded to switch to %d\n", + cmd->base.speed); + } else { + netdev_err(netdev, "unsupported speed %d\n", + cmd->base.speed); + return -EINVAL; + } + } + if (cmd->base.duplex != phy_info->dup) { + netdev_info(netdev, "unsupported duplex %#x\n", + cmd->base.duplex); + } + } + + if (cmd->base.phy_address != phy_info->phy_addr) { + netdev_err(netdev, "invalid phy address %#x\n", + cmd->base.phy_address); + return -EINVAL; + } + + if ((cmd->base.autoneg != AUTONEG_ENABLE) && + (cmd->base.autoneg != AUTONEG_DISABLE)) { + netdev_err(netdev, "unsupported autoneg %#x\n", + cmd->base.autoneg); + return -EINVAL; + } + + if (cmd->base.autoneg == AUTONEG_DISABLE) { + if (!priv->mii && cmd->base.duplex != DUPLEX_FULL) { + netdev_err(netdev, "unsupported duplex %#x\n", + cmd->base.duplex); + return -EINVAL; + } + } + + netif_dbg(priv, link, netdev, + "requested advertisement 0x%*pb, phy supported 0x%*pb\n", + __ETHTOOL_LINK_MODE_MASK_NBITS, cmd->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, lks->link_modes.supported); + + bitmap_and(advertising, + cmd->link_modes.advertising, 
lks->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if ((cmd->base.autoneg == AUTONEG_ENABLE) && + bitmap_empty(advertising, __ETHTOOL_LINK_MODE_MASK_NBITS)) { + netdev_err(netdev, "unsupported requested advertisement\n"); + return -EINVAL; + } + + if (cmd->base.port != phy_info->port_type) { + netdev_err(netdev, + "unsupported port type %#x\n", cmd->base.port); + return -EINVAL; + } + + ret = 0; + set_bit(DN200_PHY_IN_RESET, &phy_info->phy_state); + phy_info->speed = cmd->base.speed; + if (!priv->plat_ex->has_xpcs) + phy_info->dup = cmd->base.duplex; + bitmap_copy(lks->link_modes.advertising, advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + phy_info->setting_speed = cmd->base.speed; + if (cmd->base.speed != phy_info->last_link_speed) { + phy_info->cur_an = cmd->base.autoneg; + if (phy_info->cur_an == AUTONEG_ENABLE) + DN200_SET_ADV(lks, Autoneg); + else + DN200_CLR_ADV(lks, Autoneg); + if (!priv->mii) { + if (phy_info->phy_multispeed_work.work.func) + cancel_delayed_work_sync(&phy_info->phy_multispeed_work); + dn200_normal_reset(priv); + } else + clear_bit(DN200_PHY_IN_RESET, &phy_info->phy_state); + } else { + clear_bit(DN200_PHY_IN_RESET, &phy_info->phy_state); + DN200_SET_SUP(lks, Autoneg); + phy_info->cur_an = cmd->base.autoneg; + if (phy_info->cur_an == AUTONEG_ENABLE) + DN200_SET_ADV(lks, Autoneg); + else + DN200_CLR_ADV(lks, Autoneg); + } + if (curr_an != phy_info->cur_an && !phy_info->phydev) { + if (phy_info->link_status) { + phy_info->link_status = DN200_LINK_DOWN; + dn200_phy_info_state_change(phy_info); + dn200_phy_print_status(phy_info); + netif_carrier_off(phy_info->dev); + phy_info->sfp_rx_los = true; + } + } + if (phy_info->phydev && cmd->base.eth_tp_mdix_ctrl) { + extern_phy_mdix_status_set(phy_info->phydev, + cmd->base.eth_tp_mdix_ctrl); + } + phy_info->an = phy_info->cur_an; + if (netif_running(netdev) && phy_info->phydev) + ret = phy_info->phy_ops->an_config(phy_info); + + return ret; +} + +static int 
dn200_get_phy_pauseparam(struct dn200_phy_info *phy_info, + struct ethtool_pauseparam *pause) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (phy_info->phydev) + pause->autoneg = priv->flow_ctrl_an; + else + pause->autoneg = phy_info->an; + pause->rx_pause = !!(phy_info->pause & MLO_PAUSE_RX); + pause->tx_pause = !!(phy_info->pause & MLO_PAUSE_TX); + + return 0; +} + +static int dn200_set_phy_pauseparam(struct dn200_phy_info *phy_info, + struct ethtool_pauseparam *pause) +{ + struct ethtool_link_ksettings *lks = &phy_info->lks; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (phy_info->phydev) { + if (!pause->autoneg) { + phy_info->pause = 0; + if (pause->tx_pause) + phy_info->pause |= MLO_PAUSE_TX; + if (pause->rx_pause) + phy_info->pause |= MLO_PAUSE_RX; + + priv->flow_ctrl = phy_info->pause; + } else + priv->flow_ctrl = MLO_PAUSE_NONE; + + priv->flow_ctrl_an = pause->autoneg; + } else { + phy_info->pause = 0; + if (pause->tx_pause) + phy_info->pause |= MLO_PAUSE_TX; + if (pause->rx_pause) + phy_info->pause |= MLO_PAUSE_RX; + + if (pause->autoneg != phy_info->an) { + netdev_info(phy_info->dev, "To change autoneg please use: ethtool -s autoneg \n"); + return -EOPNOTSUPP; + } + priv->flow_ctrl = phy_info->pause; + + DN200_CLR_ADV(lks, Pause); + DN200_CLR_ADV(lks, Asym_Pause); + + if (pause->rx_pause) { + DN200_SET_ADV(lks, Pause); + DN200_SET_ADV(lks, Asym_Pause); + } + + if (pause->tx_pause) { + /* Equivalent to XOR of Asym_Pause */ + if (DN200_ADV(lks, Asym_Pause)) + DN200_CLR_ADV(lks, Asym_Pause); + else + DN200_SET_ADV(lks, Asym_Pause); + } + } + + dn200_normal_reset(priv); + + return 0; +} + +static int dn200_phy_loopback(struct dn200_phy_info *phy_info, bool enable) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + int ret = 0; + u32 value = 0; + u32 link_status = DN200_LINK_DOWN; + + if (PRIV_IS_VF(priv)) + return -EOPNOTSUPP; + + if (phy_info->phydev) { + ret = phy_loopback(phy_info->phydev, enable); + } else if 
(phy_info->xpcs) { + value = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, MII_BMCR); + if (enable) + value |= PMA_CTRL1_LB; + else + value &= ~PMA_CTRL1_LB; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + MII_BMCR, value); + ret = + readl_poll_timeout(phy_info->phy_ops->link_status, + link_status, !(link_status & BIT(0)), + 500, 1000000); + } + return ret; +} + +static int dn200_phy_link_train_config(struct dn200_phy_info *phy_info); +static int dn200_nway_reset(struct dn200_phy_info *phy_info) +{ + int ret = -EOPNOTSUPP; + + if (phy_info->phydev) + ret = phy_restart_aneg(phy_info->phydev); + else { + phy_info->link_status = DN200_LINK_DOWN; + phy_info->an_sucess = false; + ret = dn200_phy_link_train_config(phy_info); + } + return ret; +} + +static void dn200_link_status_reset(struct dn200_phy_info *phy_info, bool enable) +{ + u32 reg_val = 0; + int ret = 0; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, + 0); + reg_val = readl(priv->ioaddr + XGE_TOP_CONFIG_OFFSET + 0x1c + phy_info->xpcs_idx * 0x50); + reg_val &= ~(BIT(2) | BIT(1)); + writel(reg_val, priv->ioaddr + XGE_TOP_CONFIG_OFFSET + 0x1c + phy_info->xpcs_idx * 0x50); + usleep_range(10, 15); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL0); + reg_val &= ~RX_DT_EN_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL0, reg_val); + usleep_range(10, 15); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL0); + reg_val |= RX_DT_EN_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL0, reg_val); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL); + usleep_range(10, 15); + reg_val |= TX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + 
VR_TX_GENCTRL, reg_val); + usleep_range(10, 15); + reg_val &= ~TX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_TX_GENCTRL, reg_val); + usleep_range(10, 15); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1); + reg_val |= RX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1, reg_val); + usleep_range(10, 15); + reg_val &= ~RX_RST_0; + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL1, reg_val); + usleep_range(10, 15); + reg_val = readl(priv->ioaddr + XGE_TOP_CONFIG_OFFSET + 0x1c + phy_info->xpcs_idx * 0x50); + reg_val |= (BIT(2) | BIT(1)); + writel(reg_val, priv->ioaddr + XGE_TOP_CONFIG_OFFSET + 0x1c + phy_info->xpcs_idx * 0x50); + usleep_range(1000, 2000); + dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT1); + dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2); + if (enable) { + ret = dn200_phy_clock_stable_judge(phy_info); + if (ret) + usleep_range(10000, 20000); + } +} + +static void dn200_speed_set(struct dn200_phy_info *phy_info, u32 speed); + +static void dn200_tx_xnp(struct dn200_phy_info *phy_info) +{ + u16 reg_val; + + /*send null page */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_XNP_TX3, 0); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_XNP_TX2, 0); + reg_val = (BIT(13) | BIT(0)); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_XNP_TX1, reg_val); + usleep_range(100, 200); +} + +static u16 dn200_an_rx_intr_get(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0; + unsigned long an_timer_start = 0; + + an_timer_start = jiffies; + while (!time_after(jiffies, an_timer_start + msecs_to_jiffies(1000))) { + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR); + if (reg_val) { + netdev_dbg(phy_info->dev, "%s recv an intr %#x\n", + __func__, reg_val); + /* Clear intr status and enable xpcs intr */ + 
dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_AN, VR_AN_INTR, GENMASK(2, 0), + 0, 0); + return reg_val; + } + usleep_range(100, 200); + } + netdev_dbg(phy_info->dev, + "%s lwait for AN_PG_RCV intr timeout\n", __func__); + return 0; +} + +#define DN200_MAX_AN_LINK_UP_TIME (1000) +#define DN200_AN_LINK_UP_WAIT_INTR (1) +#define DN200_AN_SUCCESS_LINK_UP_SUCESS (DN200_MAX_AN_LINK_UP_TIME + 100) +static int dn200_rx_train_sw_process(struct dn200_phy_info *phy_info); +static void dn200_kr_train_disable(struct dn200_phy_info *phy_info); +static int dn200_10G_rx_train_set(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0, ad_reg = 0, ctrl_val = 0; + unsigned long an_timer_start = 0; + int ret = 0; + + if (phy_info->speed == SPEED_10000) { + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV2, GENMASK(15, 0), 0, 0x80); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_VEND2, + SR_MII_CTRL, AN_ENABLE, AN_ENABLE_SHIFT, 0); + } else if (phy_info->speed == SPEED_1000) { + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV2, GENMASK(15, 0), 0, 0x20); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_VEND2, + SR_MII_CTRL, AN_ENABLE, AN_ENABLE_SHIFT, 1); + } + usleep_range(50000, 70000); + an_timer_start = jiffies; + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR, GENMASK(2, 0), 0, 0); + /* Enable C73 auto-negotiation */ + /*2: enable an */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, 1); + usleep_range(100, 200); +recv_intr: + if (time_after(jiffies, an_timer_start + msecs_to_jiffies(2000))) { + netdev_dbg(phy_info->dev, "%s an timeout received\n", __func__); + goto disable_an; + } + reg_val = dn200_an_rx_intr_get(phy_info); + if (reg_val & AN_PG_RCV) { + netdev_dbg(phy_info->dev, "%s AN_PG_RCV received\n", __func__); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + 
SR_AN_LP_ABL2); + if (!(reg_val & BIT(7))) { + netdev_dbg(phy_info->dev, + "%s link partner does not support 10G KR %#x\n", + __func__, reg_val); + goto disable_an; + } + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_LP_ABL3); + if ((reg_val & GENMASK(15, 14)) != GENMASK(15, 14)) { + netdev_dbg(phy_info->dev, + "%s link partner does not support 10G fec %#x\n", + __func__, reg_val); + goto disable_an; + } + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_LP_ABL1); + ad_reg = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV1); + if ((reg_val & ADVERTISE_NPAGE) || (ad_reg & ADVERTISE_NPAGE)) { + netdev_dbg(phy_info->dev, + "%s link partner need np %#x\n", __func__, + reg_val); + dn200_tx_xnp(phy_info); + } else { + goto link_train; + } +XNP_RECV: + reg_val = dn200_an_rx_intr_get(phy_info); + if (reg_val & AN_PG_RCV) { + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_AN, AN_LP_XNP_ABL1); + ad_reg = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_AN, SR_AN_XNP_TX1); + if ((reg_val & AN_ADV_NP) || (ad_reg & AN_ADV_NP)) { + netdev_dbg(phy_info->dev, + "%s link partner need np %#x\n", + __func__, reg_val); + dn200_tx_xnp(phy_info); + goto XNP_RECV; + } else { + reg_val = 0; + goto link_train; + } + } else { + if (!time_after(jiffies, an_timer_start + msecs_to_jiffies(2000))) + goto XNP_RECV; + goto disable_an; + } + +link_train: + ctrl_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, SR_PMA_KR_FEC_CTRL); + ctrl_val &= + ~(MDIO_PMA_10GBR_FECABLE_ABLE | + MDIO_PMA_10GBR_FECABLE_ERRABLE); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_LP_ABL3); + ad_reg = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV3); + if ((reg_val & (KR10G_FEC_ABL | KR10G_FEC_REQ)) + && (ad_reg & (KR10G_FEC_ABL | KR10G_FEC_REQ))) { + netdev_dbg(phy_info->dev, + "%s link partner support fec %#x local 
%#x\n", + __func__, reg_val, ad_reg); + ctrl_val |= (MDIO_PMA_10GBR_FECABLE_ABLE); + } + if (phy_info->speed == SPEED_10000) { + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + SR_PMA_KR_FEC_CTRL, ctrl_val); + ret = dn200_rx_train_sw_process(phy_info); + usleep_range(20000, 30000); + } + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, VR_RX_GENCTRL0, + RX_DT_EN_0_MASK, RX_DT_EN_0_S, 1); + usleep_range(2000, 3000); + if (ret < 0) + goto disable_an; + + /* Wait for link up */ + an_timer_start = jiffies; + while (true) { + if (phy_info->speed == SPEED_10000) { + reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, + MDIO_PCS_10GBRT_STAT1); + if (reg_val & XPCS_10G_PLU) { + netdev_dbg(phy_info->dev, + "%s link up as 10G speed mode\n", + __func__); + break; + } + } else if (phy_info->speed == SPEED_1000) { + reg_val = + dn200_phy_read(phy_info, MDIO_MMD_PCS, + XS_PCS_LSTS); + if (reg_val & MDIO_PHYXS_LNSTAT_ALIGN) { + netdev_dbg(phy_info->dev, + "%s link up as 1G speed mode\n", + __func__); + break; + } + } + if (time_after(jiffies, an_timer_start + msecs_to_jiffies(DN200_MAX_AN_LINK_UP_TIME))) { + netdev_dbg(phy_info->dev, + "%s wait for link up timeout\n", __func__); + goto disable_an; + } + usleep_range(DN200_AN_LINK_UP_WAIT_INTR * 1000, (DN200_AN_LINK_UP_WAIT_INTR + 1) * 1000); + } + /* Waiting for AN_INT_CMPLT */ + while (true) { + reg_val = dn200_phy_read(phy_info, MDIO_MMD_AN, VR_AN_INTR); + if (reg_val & BIT(0)) { + netdev_dbg(phy_info->dev, "AN_INT_CMPLT received\n"); + break; + } + if (time_after(jiffies, an_timer_start + msecs_to_jiffies(DN200_MAX_AN_LINK_UP_TIME))) { + netdev_dbg(phy_info->dev, + "%s wait for AN_INT_CMPLT received timeout\n", __func__); + goto disable_an; + } + usleep_range(DN200_AN_LINK_UP_WAIT_INTR * 1000, (DN200_AN_LINK_UP_WAIT_INTR + 1) * 1000); + } + /*read clear link remote fault */ + dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2); + /*wait 500ms for phy clock reset complete */ + 
usleep_range(DN200_AN_SUCCESS_LINK_UP_SUCESS * 1000, (DN200_AN_SUCCESS_LINK_UP_SUCESS + 1) * 1000); + if (phy_info->speed == SPEED_10000) { + reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, + MDIO_PCS_10GBRT_STAT1); + if (!(reg_val & XPCS_10G_PLU)) { + netdev_dbg(phy_info->dev, + "%s link up as %d speed mode failed!\n", + __func__, phy_info->speed); + goto disable_an; + } + } else if (phy_info->speed == SPEED_1000) { + reg_val = + dn200_phy_read(phy_info, MDIO_MMD_PCS, + XS_PCS_LSTS); + if (!(reg_val & MDIO_PHYXS_LNSTAT_ALIGN)) { + netdev_dbg(phy_info->dev, + "%s link up as %d speed mode failed!\n", + __func__, phy_info->speed); + goto disable_an; + } + } + phy_info->an_sucess = true; + // phy_info->speed = SPEED_10000; + } else if (reg_val & AN_INC_LINK) { + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_LP_ABL2); + if ((reg_val & BIT(7))) { + netdev_dbg(phy_info->dev, + "%s link partner support 10G KR %#x\n", + __func__, reg_val); + /* Clear intr status and enable xpcs intr */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_AN, VR_AN_INTR, GENMASK(2, 0), + 0, 0); + goto recv_intr; + } + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_LP_ABL3); + if ((reg_val & GENMASK(15, 14)) == GENMASK(15, 14)) { + netdev_dbg(phy_info->dev, + "%s link partner support 10G FEC %#x\n", + __func__, reg_val); + /* Clear intr status and enable xpcs intr */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_AN, VR_AN_INTR, GENMASK(2, 0), + 0, 0); + goto recv_intr; + } + netdev_dbg(phy_info->dev, + "%s AN_INC_LINK received and link partner does not support 10GKR speed mode\n", + __func__); + goto disable_an; + } else if (reg_val & AN_INT_CMPLT) { + netdev_dbg(phy_info->dev, + "%s AN_INT_CMPLT received and continues to work in 10GKR speed mode\n", + __func__); + goto disable_an; + } else { + goto disable_an; + } + return 0; +disable_an: + phy_info->an_sucess = false; + // phy_info->speed = 
phy_info->setting_speed; + dn200_kr_train_disable(phy_info); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_RX_GENCTRL0, RX_DT_EN_0_MASK, RX_DT_EN_0_S, 1); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, 0); + /* reset link status for AN failure caused by "block lock fail" */ + dn200_link_status_reset(phy_info, false); + phy_info->speed_reset_time = jiffies + DN200_SFP_RESET_TIME; + return 0; +} + +static int dn200_rx_eq_process(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0, ctrl_val = 0; + u16 cff_upd0 = 0, cff_upd1 = 0, cff_updtm1 = 0; + int retry = 1000; + + /*rx_ad_req start */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP_32G_RX_EQ_CTRL4, RX_AD_REQ, RX_AD_REQ_S, + 1); + usleep_range(10, 20); + + while (retry-- > 0) { + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, + VR_XS_PMA_MP_12G_16G_25G_MISC_STS); + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, VR_PMA_PHY_RX_EQ_CEU); + ctrl_val = 0; + if ((reg_val & CFF_UPDT_VLD_ALL) != 0) { + netdev_dbg(phy_info->dev, + "%s %d VR_PMA_PHY_RX_EQ_CEU %#x\n", __func__, + __LINE__, reg_val); + if (reg_val & CFF_UPDT1_VLD) { + cff_upd1 = ((reg_val & CFF_UPDT1) >> 4); + if (cff_upd1 == 0x2) + cff_upd1 = 0; + ctrl_val |= CFF_UPDT1_VLD; + ctrl_val |= (cff_upd1 << 4); + } + if (reg_val & CFF_UPDT0_VLD) { + cff_upd0 = ((reg_val & CFF_UPDT0) >> 2); + if (cff_upd0 == 0x2) + cff_upd0 = 0; + ctrl_val |= CFF_UPDT0_VLD; + ctrl_val |= (cff_upd0 << 2); + } + if (reg_val & CFF_UPDTM1_VLD) { + cff_updtm1 = (reg_val & CFF_UPDTM1); + if (cff_updtm1 == 0x2) + cff_updtm1 = 0; + ctrl_val |= CFF_UPDTM1_VLD; + ctrl_val |= (cff_updtm1); + } + + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, SR_PMA_KR_PMD_STS); + if (!(reg_val & BIT(0))) { + reg_val = + dn200_xpcs_read(phy_info, + phy_info->phy_addr, + MDIO_MMD_PMAPMD, + 
	/* NOTE(review): mangled patch segment — it opens inside
	 * dn200_rx_eq_process(), in the branch where the local device has not
	 * yet reported "receiver ready" (SR_PMA_KR_PMD_STS bit 0 clear): the
	 * coefficient update derived above is merged into the KR training RX
	 * EQ control register so it is sent to the link partner.
	 */
					    VR_PMA_KRTR_RX_EQ_CTRL);
				/* Bits 5:0 carry the c(+1)/c(0)/c(-1) requests. */
				reg_val &= ~(GENMASK(5, 0));
				reg_val |= ctrl_val;
				netdev_dbg(phy_info->dev,
					   "RX: %d c+1(%d) c0(%d) c-1(%d)\n",
					   __LINE__,
					   (u16) (reg_val & GENMASK(5, 4)) >> 4,
					   (u16) (reg_val & GENMASK(3, 2)) >> 2,
					   (u16) (reg_val & GENMASK(1, 0)));
				dn200_xpcs_write(phy_info, phy_info->phy_addr,
						 MDIO_MMD_PMAPMD,
						 VR_PMA_KRTR_RX_EQ_CTRL,
						 reg_val);
				phy_info->rx_eq_states = RX_EQ_WAIT_UPDATE;
			} else {
				/* Local device already reports ready: no
				 * further coefficient commands needed.
				 */
				netdev_dbg(phy_info->dev,
					   "%s %d SR_PMA_KR_PMD_STS %#x LD already ready\n",
					   __func__, __LINE__, reg_val);
				phy_info->rx_eq_states = RX_EQ_LD_NOCMD;
			}
			/* The adaptation request has been consumed — clear it. */
			dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
					   MDIO_MMD_PMAPMD,
					   VR_XS_PMA_MP_32G_RX_EQ_CTRL4,
					   RX_AD_REQ, RX_AD_REQ_S, 0);
			return 0;
		}
		usleep_range(10, 20);
	}
	/* No valid coefficient update arrived within the retry budget. */
	if (retry <= 0)
		phy_info->rx_eq_states = RX_EQ_LD_NOCMD;
	return 0;
}

/* Printable name of the current RX equalization state (debug logs only). */
static const char *dn200_rxeq_state_to_string(struct dn200_phy_info *phy_info)
{
	switch (phy_info->rx_eq_states) {
	case RX_EQ_NONE:
		return "RX_EQ_NONE";
	case RX_EQ_WAIT_UPDATE:
		return "RX_EQ_WAIT_UPDATE";
	case RX_EQ_SEND_HOLD:
		return "RX_EQ_SEND_HOLD";
	case RX_EQ_WAIT_NOTUPDATE:
		return "RX_EQ_WAIT_NOTUPDATE";
	case RX_EQ_POLL_COF:
		return "RX_EQ_POLL_COF";
	case RX_EQ_READY:
		return "RX_EQ_READY";
	case RX_EQ_LD_NOCMD:
		return "RX_EQ_LD_NOCMD";
	default:
		return "error state";
	}
	/* Unreachable: every case above returns. */
	return "error state";
}

/* Printable name of the current TX equalization state (debug logs only). */
static const char *dn200_txeq_state_to_string(struct dn200_phy_info *phy_info)
{
	switch (phy_info->tx_eq_states) {
	case TX_EQ_NONE:
		return "TX_EQ_NONE";
	case TX_EQ_POLL_LP_CMD:
		return "TX_EQ_POLL_LP_CMD";
	case TX_EQ_WAIT_HOLD_CMD:
		return "TX_EQ_WAIT_HOLD_CMD";
	case TX_EQ_WAIT_LD_VLD:
		return "TX_EQ_WAIT_LD_VLD";
	case TX_EQ_WAIT_LD_INVLD:
		return "TX_EQ_WAIT_LD_INVLD";
	case TX_EQ_LP_RDY:
		return "TX_EQ_LP_RDY";
	default:
		return "error state";
	}
	/* Unreachable: every case above returns. */
	return "error state";
}

/*
 * TX equalization state machine for KR link training: polls the link
 * partner's coefficient commands (preset/init/c+1/c0/c-1), forwards them to
 * the local PHY, and acknowledges each update/hold handshake step through
 * the vendor KR training registers.  Always returns 0; progress is tracked
 * in phy_info->tx_eq_states.
 */
static int dn200_tx_eq_state_mach(struct dn200_phy_info *phy_info)
{
	u16 reg_val = 0, ctrl_val = 0;
	u16 cff_upd0 = 0, cff_upd1 = 0, cff_updtm1 = 0;

	netdev_dbg(phy_info->dev, "NOW tx_eq_states(%d): %s\n",
		   phy_info->tx_eq_states,
		   dn200_txeq_state_to_string(phy_info));
	switch (phy_info->tx_eq_states) {
	case TX_EQ_NONE:
		break;
	case TX_EQ_POLL_LP_CMD:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_LP_CESTS);
		/* Link partner signalled "receiver ready": TX side is done. */
		if (reg_val & LP_RR) {
			netdev_dbg(phy_info->dev, "TX: recv lp rx_rdy %x\n",
				   reg_val);
			phy_info->tx_eq_states = TX_EQ_LP_RDY;
			break;
		}
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_LP_CEU);
		if ((reg_val & LP_PRST) || (reg_val & LP_INIT)
		    || (reg_val & GENMASK(5, 0))) {
			/* Forward whatever command(s) the partner sent. */
			ctrl_val =
			    ((reg_val & LP_PRST) | (reg_val & LP_INIT) |
			     (reg_val & GENMASK(5, 0)));
			netdev_dbg(phy_info->dev,
				   "TX: %d preset(%d) INIT(%d) c+1(%d) c0(%d) c-1(%d)\n",
				   __LINE__, !!(reg_val & LP_PRST),
				   !!(reg_val & LP_INIT),
				   (u16) (reg_val & GENMASK(5, 4)) >> 4,
				   (u16) (reg_val & GENMASK(3, 2)) >> 2,
				   (u16) (reg_val & GENMASK(1, 0)));
			dn200_xpcs_write(phy_info, phy_info->phy_addr,
					 MDIO_MMD_PMAPMD,
					 VR_PMA_KRTR_TX_EQ_CFF_CTRL, ctrl_val);
			phy_info->tx_eq_states = TX_EQ_WAIT_LD_VLD;
		}
		break;
	case TX_EQ_WAIT_LD_VLD:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, VR_PMA_PHY_TX_EQ_STS);
		if (reg_val & CFF_UPDT_VLD_ALL) {
			ctrl_val = 0;
			/* A raw update value of 0x2 is normalized to 0 for
			 * each coefficient — TODO(review): confirm this
			 * encoding against the XPCS databook.
			 */
			if (reg_val & CFF_UPDT1_VLD) {
				cff_upd1 = ((reg_val & CFF_UPDT1) >> 4);
				if (cff_upd1 == 0x2)
					cff_upd1 = 0;
				ctrl_val |= (cff_upd1 << 4);
			}
			if (reg_val & CFF_UPDT0_VLD) {
				cff_upd0 = ((reg_val & CFF_UPDT0) >> 2);
				if (cff_upd0 == 0x2)
					cff_upd0 = 0;
				ctrl_val |= (cff_upd0 << 2);
			}
			if (reg_val & CFF_UPDTM1_VLD) {
				cff_updtm1 = (reg_val & CFF_UPDTM1);
				if (cff_updtm1 == 0x2)
					cff_updtm1 = 0;
				ctrl_val |= (cff_updtm1);
			}
			reg_val =
			    dn200_xpcs_read(phy_info, phy_info->phy_addr,
					    MDIO_MMD_PMAPMD,
					    VR_PMA_KRTR_TX_EQ_STS_CTRL);
			reg_val &= ~(GENMASK(5, 0));
			reg_val |= ctrl_val;
			netdev_dbg(phy_info->dev,
				   "TX: %d c+1(%d) c0(%d) c-1(%d) send update\n",
				   __LINE__,
				   (u16) (reg_val & GENMASK(5, 4)) >> 4,
				   (u16) (reg_val & GENMASK(3, 2)) >> 2,
				   (u16) (reg_val & GENMASK(1, 0)));
			dn200_xpcs_write(phy_info, phy_info->phy_addr,
					 MDIO_MMD_PMAPMD,
					 VR_PMA_KRTR_TX_EQ_STS_CTRL, reg_val);
			phy_info->tx_eq_states = TX_EQ_WAIT_HOLD_CMD;
		}
		break;
	case TX_EQ_WAIT_HOLD_CMD:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_LP_CEU);
		/* Partner dropped all commands (hold): clear ours too. */
		if (((reg_val & LP_PRST) | (reg_val & LP_INIT) |
		     (reg_val & GENMASK(5, 0))) == 0) {
			netdev_dbg(phy_info->dev,
				   "TX: %d SR_PMA_KR_LP_CEU %x recv hold done\n",
				   __LINE__, reg_val);
			dn200_xpcs_write(phy_info, phy_info->phy_addr,
					 MDIO_MMD_PMAPMD,
					 VR_PMA_KRTR_TX_EQ_CFF_CTRL, 0);
			phy_info->tx_eq_states = TX_EQ_WAIT_LD_INVLD;
		}
		break;
	case TX_EQ_WAIT_LD_INVLD:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, VR_PMA_PHY_TX_EQ_STS);
		if (!(reg_val & CFF_UPDT_VLD_ALL)) {
			netdev_dbg(phy_info->dev,
				   "TX: %d VR_PMA_PHY_TX_EQ_STS %x send notupdate\n",
				   __LINE__, reg_val);
			reg_val =
			    dn200_xpcs_read(phy_info, phy_info->phy_addr,
					    MDIO_MMD_PMAPMD,
					    VR_PMA_KRTR_TX_EQ_STS_CTRL);
			reg_val &= ~(GENMASK(5, 0));
			dn200_xpcs_write(phy_info, phy_info->phy_addr,
					 MDIO_MMD_PMAPMD,
					 VR_PMA_KRTR_TX_EQ_STS_CTRL, reg_val);
			phy_info->tx_eq_states = TX_EQ_POLL_LP_CMD;
		}
		break;
	case TX_EQ_LP_RDY:
		break;
	default:
		break;
	}
	return 0;
}

/*
 * RX equalization state machine for KR link training: drives the local
 * coefficient request sequence (init -> wait update -> hold -> wait
 * not-update -> poll coefficients) against the link partner.  Returns 0,
 * or -EINVAL on an unknown state.
 */
static int dn200_rx_eq_state_mach(struct dn200_phy_info *phy_info)
{
	u16 reg_val;

	netdev_dbg(phy_info->dev, "NOW rx_eq_states(%d): %s\n",
		   phy_info->rx_eq_states,
		   dn200_rxeq_state_to_string(phy_info));
	switch (phy_info->rx_eq_states) {
	case RX_EQ_NONE:
		/* program INT to LP */
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
				   MDIO_MMD_PMAPMD, VR_PMA_KRTR_RX_EQ_CTRL,
				   GENMASK(5, 0), 0, 0);
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
				   MDIO_MMD_PMAPMD,
	/* NOTE(review): mangled patch segment — continues dn200_rx_eq_state_mach()
	 * in the RX_EQ_NONE case: pulse the INIT request bit (bit 6) to the
	 * link partner, then move to RX_EQ_WAIT_UPDATE.
	 */
				   VR_PMA_KRTR_RX_EQ_CTRL,
				   GENMASK(6, 6), 6, 1);
		phy_info->rx_eq_states = RX_EQ_WAIT_UPDATE;
		netdev_dbg(phy_info->dev, "RX: send init\n");
		usleep_range(100, 200);
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
				   MDIO_MMD_PMAPMD, VR_PMA_KRTR_RX_EQ_CTRL,
				   GENMASK(6, 6), 6, 0);
		break;
	case RX_EQ_WAIT_UPDATE:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_LP_CESTS);
		/* wait for LP update: all three coefficient statuses = "updated" */
		if ((reg_val & GENMASK(5, 0)) == 0b010101) {
			netdev_dbg(phy_info->dev,
				   "RX: SR_PMA_KR_LP_CESTS %x update done\n",
				   reg_val);
			phy_info->rx_eq_states = RX_EQ_SEND_HOLD;
		}
		break;
	case RX_EQ_SEND_HOLD:
		/* Clear our coefficient requests (hold). */
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
				   MDIO_MMD_PMAPMD, VR_PMA_KRTR_RX_EQ_CTRL,
				   GENMASK(5, 0), 0, 0);
		netdev_dbg(phy_info->dev, "RX: send update\n");
		phy_info->rx_eq_states = RX_EQ_WAIT_NOTUPDATE;
		break;
	case RX_EQ_WAIT_NOTUPDATE:
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_LP_CESTS);
		/* wait for LP to report not-updated again */
		if ((reg_val & GENMASK(5, 0)) == 0) {
			netdev_dbg(phy_info->dev,
				   "RX: SR_PMA_KR_LP_CESTS %x NOTUPDATE done\n",
				   reg_val);
			phy_info->rx_eq_states = RX_EQ_POLL_COF;
		}
		break;
	case RX_EQ_POLL_COF:
		/* Ask the PHY for the next coefficient adjustment. */
		dn200_rx_eq_process(phy_info);
		break;
	case RX_EQ_LD_NOCMD:
		/* Local device has no further requests: assert receiver ready. */
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr,
				   MDIO_MMD_PMAPMD, VR_PMA_KRTR_RX_EQ_CTRL,
				   GENMASK(8, 8), 8, 1);
		netdev_dbg(phy_info->dev, "RX:local set rr rdy\n");
		break;
	case RX_EQ_READY:
		break;
	default:
		netdev_info(phy_info->dev, "err state %d\n",
			    phy_info->rx_eq_states);
		return -EINVAL;
	}
	return 0;
}

/* Tear down KR link training state and disable clause-37 AN on SR_MII_CTRL. */
static void dn200_kr_train_disable(struct dn200_phy_info *phy_info)
{
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_XS_PMA_MP_32G_RX_EQ_CTRL4, GENMASK(12, 12), 12,
			   0);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_RX_EQ_CTRL, GENMASK(8, 8), 8, 1);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_RX_EQ_CTRL, GENMASK(15, 15), 15, 0);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_TX_EQ_STS_CTRL, GENMASK(15, 15), 15, 0);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_XS_PMA_MP_32G_RX_EQ_CTRL4, GENMASK(10, 8), 8, 0);
	/* Enable train */
	/* NOTE(review): comment inherited from the original; these two writes
	 * clear bit 1 and set bit 0 of SR_PMA_KR_PMD_CTRL — verify against
	 * the TR_EN/RS_TR bit positions used in dn200_rx_train_sw_process().
	 */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   SR_PMA_KR_PMD_CTRL, GENMASK(1, 1), 1, 0);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   SR_PMA_KR_PMD_CTRL, GENMASK(0, 0), 0, 1);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_VEND2,
			   SR_MII_CTRL, AN_ENABLE, AN_ENABLE_SHIFT, 0);
}

/*
 * Software-driven KR training loop: arms the training engine, then runs the
 * RX and TX equalization state machines until both sides report ready or
 * the hardware/software training timeout fires.
 * Returns 0 on success, -ETIMEDOUT on timeout (training disabled first).
 */
static int dn200_rx_train_sw_process(struct dn200_phy_info *phy_info)
{
	u16 reg_val = 0;

	phy_info->rx_eq_states = RX_EQ_NONE;
	phy_info->tx_eq_states = TX_EQ_NONE;
	usleep_range(1000, 2000);
	netdev_dbg(phy_info->dev, "start sw kt process %x\n", reg_val);
	/* RR RDY */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_RX_EQ_CTRL, RR_RDY, 8, 0);
	/* Enable MM */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_RX_EQ_CTRL, RX_EQ_MM, 15, 1);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_PMA_KRTR_TX_EQ_STS_CTRL, TX_EQ_MM, 15, 1);
	/* enable ping-pong mode */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_XS_PMA_MP_32G_RX_EQ_CTRL4, GENMASK(10, 8), 8,
			   0x7);
	/* Enable train */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   SR_PMA_KR_PMD_CTRL, TR_EN, TR_EN_S, 1);
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   SR_PMA_KR_PMD_CTRL, RS_TR, RS_TR_S, 1);
	phy_info->tr_timeout = jiffies + DN200_KT_TRAIN_TIME;
	usleep_range(10, 20);
	/* rx_ad_req start */
	dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
			   VR_XS_PMA_MP_32G_RX_EQ_CTRL4, RX_AD_REQ, RX_AD_REQ_S,
			   1);
	usleep_range(10, 20);

	while (true) {
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD, SR_PMA_KR_PMD_STS);
		/* Bit 3: hardware training failure/timeout. */
		if (reg_val & (BIT(3))) {
			netdev_dbg(phy_info->dev, "%s link train timeout\n",
				   __func__);
			dn200_kr_train_disable(phy_info);
			return -ETIMEDOUT;
		} else if (reg_val & BIT(0)) {
			/* Bit 0: local receiver trained. */
			netdev_dbg(phy_info->dev, "local recv ready %x\n",
				   reg_val);
			phy_info->rx_eq_states = RX_EQ_READY;
		}
		/* Bit 1: frame lock — start serving the partner's commands. */
		if (reg_val & BIT(1)) {
			if (phy_info->tx_eq_states == TX_EQ_NONE)
				phy_info->tx_eq_states = TX_EQ_POLL_LP_CMD;
		}
		if (time_after(phy_info->tr_timeout, jiffies)) {
			if (phy_info->tx_eq_states == TX_EQ_LP_RDY &&
			    phy_info->rx_eq_states == RX_EQ_READY) {
				netdev_dbg(phy_info->dev,
					   "link train success\n");
				return 0;
			}
			dn200_rx_eq_state_mach(phy_info);
			dn200_tx_eq_state_mach(phy_info);
			usleep_range(10, 20);
		} else {
			netdev_dbg(phy_info->dev, "%s link train timeout\n",
				   __func__);
			dn200_kr_train_disable(phy_info);
			return -ETIMEDOUT;
		}
	}
	return 0;
}

/*
 * When autoneg is requested: configure clause-73 AN, clear the FEC ability
 * bits in SR_PMA_KR_FEC_CTRL, and schedule the KR training worker.
 * Always returns 0.
 */
static int dn200_phy_link_train_config(struct dn200_phy_info *phy_info)
{
	u16 reg_val = 0;

	if (phy_info->an) {
		dn200_xpcs_an_config(phy_info);
		reg_val =
		    dn200_xpcs_read(phy_info, phy_info->phy_addr,
				    MDIO_MMD_PMAPMD,
				    SR_PMA_KR_FEC_CTRL);
		reg_val &=
		    ~(MDIO_PMA_10GBR_FECABLE_ABLE |
		      MDIO_PMA_10GBR_FECABLE_ERRABLE);
		dn200_xpcs_write(phy_info, phy_info->phy_addr,
				 MDIO_MMD_PMAPMD, SR_PMA_KR_FEC_CTRL,
				 reg_val);
		queue_work(phy_info->dev_workqueue,
			   &phy_info->kr_train_work);
	}
	return 0;
}

/* Workqueue handler: run 10G RX training, guarded by DN200_PHY_IN_TRAIN. */
static void dn200_kr_train_work(struct work_struct *work)
{
	struct dn200_phy_info *phy_info = container_of(work,
						       struct dn200_phy_info,
						       kr_train_work);

	/* someone else is in init, wait until next timer event */
	if (test_and_set_bit(DN200_PHY_IN_TRAIN, &phy_info->phy_state))
		return;

	dn200_10G_rx_train_set(phy_info);

	clear_bit(DN200_PHY_IN_TRAIN, &phy_info->phy_state);
}

/*
 * Poll the external PHY or the XPCS for current link state
 * (body continues in the next segment).  Returns DN200_LINK_UP or
 * DN200_LINK_DOWN; for an external PHY, the phylib link flag.
 */
static int dn200_link_status_get(struct dn200_phy_info *phy_info)
{
	int ret = 0;
	/* NOTE(review): mangled patch segment — continues the local-variable
	 * declarations of dn200_link_status_get().
	 */
	u16 reg_val = 0;
	u16 block_err;

	if (phy_info->phydev) {
		/* Check external PHY */
		ret = phy_read_status(phy_info->phydev);
		if (ret < 0)
			return 0;

		if (phy_info->an == AUTONEG_ENABLE &&
		    !phy_aneg_done(phy_info->phydev))
			return 0;

		return phy_info->phydev->link;
	}
	if (phy_info->speed == SPEED_10000) {
		/* 10GBASE-R errored-block counter (low byte of STAT2). */
		block_err =
		    dn200_phy_read(phy_info, MDIO_MMD_PCS,
				   MDIO_PCS_10GBRT_STAT2) & 0xFF;
		if (!phy_info->an_sucess)
			reg_val =
			    dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2);
		if ((block_err)
		    || (reg_val & (XS_PCS_STS2_RF | XS_PCS_STS2_TF))) {
			/* Tolerate up to DN200_MAX_BLK_ERR_CNT polls before
			 * declaring a fault and resetting the PCS path.
			 */
			if (phy_info->blk_err_ck < DN200_MAX_BLK_ERR_CNT) {
				phy_info->blk_err_ck++;
			} else {
				if (block_err != 0) {
					netdev_dbg(phy_info->dev,
						   "Check high block err %x !!, Please check the optical\n",
						   block_err);
				}
				if (reg_val & (XS_PCS_STS2_RF | XS_PCS_STS2_TF)) {
					netdev_dbg(phy_info->dev,
						   "fault detect %x !!\n",
						   reg_val);
					if (phy_info->speed == SPEED_1000)
						dn200_xpcs_switch_to_1G(phy_info);
					else
						dn200_xpcs_switch_to_10G(phy_info);
					phy_info->blk_err_ck = false;
					goto LINK_DOWN;
				}
			}
		}
	}
	/* Link status is latched low, so read once to clear
	 * and then read again to get current state
	 */
	reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT1);
	if (!(reg_val & MDIO_STAT1_LSTATUS))
		goto LINK_DOWN;
	if (reg_val & MDIO_STAT1_LSTATUS) {
		if (phy_info->phy_interface == PHY_INTERFACE_MODE_XGMII) {
			if (phy_info->speed == SPEED_10000) {
				reg_val =
				    dn200_phy_read(phy_info, MDIO_MMD_PCS,
						   MDIO_PCS_10GBRT_STAT1);
				if (reg_val & XPCS_10G_PLU) {
					reg_val = 0;
					/* read Transmitter Fault and Receiver Fault,
					 * LH type read twice
					 */
					reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2);
					if (!(reg_val & (XS_PCS_STS2_RF | XS_PCS_STS2_TF)))
						return DN200_LINK_UP;
					netdev_dbg(phy_info->dev,
						   "fault detect %x !!\n",
						   reg_val);
				}
			}
			if (phy_info->speed == SPEED_1000) {
				reg_val =
				    dn200_phy_read(phy_info, MDIO_MMD_PCS,
						   XS_PCS_LSTS);
				if (reg_val & MDIO_PHYXS_LNSTAT_ALIGN) {
					/* read Receiver Fault, LH type read twice */
					reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2);
					if (!(reg_val & (XS_PCS_STS2_RF)))
						return DN200_LINK_UP;
				}
			}
		} else if (phy_info->phy_interface == PHY_INTERFACE_MODE_GMII) {
			/* Lane-status is also latched: read twice. */
			reg_val =
			    dn200_phy_read(phy_info, MDIO_MMD_PCS, XS_PCS_LSTS);
			reg_val =
			    dn200_phy_read(phy_info, MDIO_MMD_PCS, XS_PCS_LSTS);
			if (reg_val & MDIO_PHYXS_LNSTAT_ALIGN) {
				/* read Receiver Fault, LH type read twice */
				reg_val = dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT2);
				if (!(reg_val & (XS_PCS_STS2_RF)))
					return DN200_LINK_UP;
			}
		}
	}
LINK_DOWN:
	/* if an is success, but link is down, close AN and reset phy */
	if (phy_info->an_sucess) {
		phy_info->an_sucess = false;
		phy_info->speed = phy_info->setting_speed;
		dn200_kr_train_disable(phy_info);
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD,
				   VR_RX_GENCTRL0, RX_DT_EN_0_MASK, RX_DT_EN_0_S, 1);
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
				   MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, 0);
		/* reset link status for AN failure caused by "block lock fail" */
		if (phy_info->speed == SPEED_1000)
			dn200_xpcs_switch_to_1G(phy_info);
		else
			dn200_xpcs_switch_to_10G(phy_info);
	}
	return DN200_LINK_DOWN;
}

/*
 * Decode the clause-73 autonegotiation result: record the link partner's
 * advertised modes, resolve pause/flow control from the AN registers, and
 * return the negotiated speed (SPEED_10000, SPEED_1000 or SPEED_UNKNOWN).
 */
static int dn200_phy_xpcs_an73_result(struct dn200_phy_info *phy_info)
{
	struct ethtool_link_ksettings *lks = &phy_info->lks;
	struct dn200_priv *priv = netdev_priv(phy_info->dev);
	/* enum dn200_mode mode; */
	unsigned int ad_reg, lp_reg, speed_modes;

	DN200_SET_LP_ADV(lks, Autoneg);
	DN200_SET_LP_ADV(lks, Backplane);

	/* Compare Advertisement and Link Partner register 1 */
	ad_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    MDIO_AN_ADVERTISE);
	lp_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    MDIO_AN_LPA);
	if (lp_reg & ADVERTISE_PAUSE_CAP)
		DN200_SET_LP_ADV(lks, Pause);
	if (lp_reg & ADVERTISE_PAUSE_ASYM)
		DN200_SET_LP_ADV(lks, Asym_Pause);

	if (phy_info->an) {
		/* Set flow control based on auto-negotiation result */
		phy_info->pause = MLO_PAUSE_NONE;
		if (ad_reg & lp_reg & ADVERTISE_PAUSE_CAP) {
			phy_info->pause |= MLO_PAUSE_TX;
			phy_info->pause |= MLO_PAUSE_RX;
		} else if (ad_reg & lp_reg & ADVERTISE_PAUSE_ASYM) {
			if (ad_reg & ADVERTISE_PAUSE_CAP)
				phy_info->pause |= MLO_PAUSE_RX;
			else if (lp_reg & ADVERTISE_PAUSE_CAP)
				phy_info->pause |= MLO_PAUSE_TX;
		}
		priv->flow_ctrl = phy_info->pause;
	}

	/* Compare Advertisement and Link Partner register 2 */
	ad_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    SR_AN_ADV2);
	lp_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    SR_AN_LP_ABL2);
	if (lp_reg & C73_10000KR)
		DN200_SET_LP_ADV(lks, 10000baseKR_Full);
	if (lp_reg & C73_1000KX)
		DN200_SET_LP_ADV(lks, 1000baseKX_Full);

	/* Highest common denominator decides the speed. */
	ad_reg &= lp_reg;
	if (ad_reg & C73_10000KR)
		speed_modes = SPEED_10000;
	else if (ad_reg & C73_1000KX)
		speed_modes = SPEED_1000;
	else
		speed_modes = SPEED_UNKNOWN;

	/* Compare Advertisement and Link Partner register 3 */
	ad_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    SR_AN_ADV3);
	lp_reg =
	    dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
			    SR_AN_LP_ABL3);
	if (lp_reg & (C73_LP_FEC_EN | C73_NEED_FEC_EN))
		DN200_SET_LP_ADV(lks, 10000baseR_FEC);

	return speed_modes;
}

/* Human-readable flow-control setting, used by link-up logging. */
static const char *dn200_phy_fc_string(struct dn200_phy_info *phy_info)
{
	if ((phy_info->pause & MLO_PAUSE_TX) && (phy_info->pause & MLO_PAUSE_RX))
		return "rx/tx";
	else if (phy_info->pause & MLO_PAUSE_RX)
		return "rx";
	else if (phy_info->pause & MLO_PAUSE_TX)
		return "tx";
	else
		return "off";
}

/* Human-readable link speed, used by link-up logging. */
static const char *dn200_phy_speed_string(int speed)
{
	switch (speed) {
	case SPEED_10:
		return "10Mbps";
	case SPEED_100:
		return "100Mbps";
	case SPEED_1000:
		return "1Gbps";
	case SPEED_2500:
		return "2.5Gbps";
	case SPEED_10000:
		return "10Gbps";
	case SPEED_UNKNOWN:
		return "Unknown";
	default:
		return "Unsupported";
	}
}

/* Log the current link state (speed/duplex/flow-control) to the kernel log. */
static void dn200_phy_print_status(struct dn200_phy_info *phy_info)
{
	if (phy_info->link_status)
		netdev_info(phy_info->dev,
			    "Link is Up - %s/%s - flow control %s\n",
			    dn200_phy_speed_string(phy_info->speed),
			    phy_info->dup == DN200_DUP_FULL ? "Full" : "Half",
			    dn200_phy_fc_string(phy_info));
	else
		netdev_info(phy_info->dev, "Link is Down\n");
}

/*
 * Refresh speed/duplex/pause/advertising after a link-state change, from
 * either the external PHY (phylib) or the clause-73 AN result, and push the
 * result to SR-IOV VFs (body continues in the next segment).
 */
static int dn200_phy_info_state_change(struct dn200_phy_info *phy_info)
{
	struct ethtool_link_ksettings *lks = &phy_info->lks;
	struct dn200_priv *priv = netdev_priv(phy_info->dev);
	bool tx_pause, rx_pause;

	/* Reset link partner advertising */
	DN200_ZERO_LP_ADV(lks);
	if (phy_info->phydev) {
		if (priv->flow_ctrl_an &&
		    !extern_phy_pause_autoneg_result(phy_info->phydev, &tx_pause, &rx_pause)) {
			phy_info->pause &= ~(MLO_PAUSE_RX | MLO_PAUSE_TX);
			if (tx_pause)
				phy_info->pause |= MLO_PAUSE_TX;
			if (rx_pause)
				phy_info->pause |= MLO_PAUSE_RX;
		}
		bitmap_copy(lks->link_modes.lp_advertising,
			    phy_info->phydev->lp_advertising,
			    __ETHTOOL_LINK_MODE_MASK_NBITS);
		bitmap_copy(lks->link_modes.advertising,
			    phy_info->phydev->advertising,
			    __ETHTOOL_LINK_MODE_MASK_NBITS);
		priv->flow_ctrl = phy_info->pause;
		phy_info->speed = phy_info->phydev->speed;
		phy_info->dup = phy_info->phydev->duplex;
	} else {
		if (phy_info->an && phy_info->an_sucess)
			phy_info->speed = dn200_phy_xpcs_an73_result(phy_info);
		/* phy_info->speed = SPEED_1000; */
		phy_info->dup = DN200_DUP_FULL;
	}
	if (PRIV_SRIOV_SUPPORT(priv)) {
		struct dn200_sriov_phy_info sriov_info;

		sriov_info.speed = phy_info->speed;
		sriov_info.dup = phy_info->dup;
		sriov_info.media_type = phy_info->media_type;
		sriov_info.an = phy_info->an;
		sriov_info.pause = phy_info->pause;
		sriov_info.phy_interface = phy_info->phy_interface;
		sriov_info.link_modes = phy_info->link_modes;
		dn200_set_phy_info(((struct
	/* NOTE(review): mangled patch segment — closes the SR-IOV push at the
	 * end of dn200_phy_info_state_change().
	 */
				     dn200_priv *)
				    netdev_priv(phy_info->dev))->hw,
				   &sriov_info);
	}

	return 0;
}

/*
 * SFP speed self-adaptation: when the link is down and autoneg has not
 * succeeded, periodically schedule the multispeed worker to retry at the
 * speed the installed SFP module supports.
 */
static void dn200_phy_speed_self_adapt(struct dn200_phy_info *phy_info)
{
	if (!phy_info->xpcs)
		return;
	/* link up, do not need to change speed */
	if (phy_info->link_status || phy_info->an_sucess)
		return;
	/* wait 2s to link at highest speed */
	if (time_after(phy_info->speed_reset_time, jiffies))
		return;
	phy_info->speed_reset_time = jiffies + DN200_SFP_RESET_TIME;
	/* firstly: old phy speed is 10G and sfp module support 10G, wait 1s for link at 10G
	 * secondly: old speed is 1G and sfp module support 10G, switch to 10G immediately
	 * than: old speed is 10G and sfp module support 1G, switch to 1G immediately
	 * lastly: old speed is 1G and sfp module support 1G, wait 1s for link at 1G
	 */
	if (phy_info->speed == SPEED_10000 &&
	    (phy_info->sfp_speed & DN200_SFP_SPEED_10000)) {
		/* delay 10s to start multispeed work */
		/* NOTE(review): comments say 10s/1ms but the delays actually
		 * queued are 2000ms/1ms/1ms/1000ms — confirm intended values.
		 */
		queue_delayed_work(phy_info->dev_workqueue,
				   &phy_info->phy_multispeed_work,
				   msecs_to_jiffies(2000));
	} else if ((phy_info->speed == SPEED_1000) &&
		   (phy_info->sfp_speed & DN200_SFP_SPEED_10000)) {
		/* delay 1ms to start multispeed work */
		queue_delayed_work(phy_info->dev_workqueue,
				   &phy_info->phy_multispeed_work,
				   msecs_to_jiffies(1));
	} else if ((phy_info->speed == SPEED_10000) &&
		   (phy_info->sfp_speed & DN200_SFP_SPEED_1000)) {
		/* delay 1ms to start multispeed work */
		queue_delayed_work(phy_info->dev_workqueue,
				   &phy_info->phy_multispeed_work,
				   msecs_to_jiffies(1));
	} else if ((phy_info->speed == SPEED_1000) &&
		   (phy_info->sfp_speed & DN200_SFP_SPEED_1000)) {
		/* delay 1ms to start multispeed work */
		queue_delayed_work(phy_info->dev_workqueue,
				   &phy_info->phy_multispeed_work,
				   msecs_to_jiffies(1000));
	}
}

/*
 * Sanity-check MAC/MTL/DMA debug registers before reporting link up.
 * Returns true when the datapath looks idle/normal; false while FIFOs or
 * DMA FSMs are active (escalates to dn200_global_err on the second
 * consecutive busy snapshot).
 */
static bool dn200_mac_dma_link_check(struct dn200_priv *priv, struct dn200_phy_info *phy_info)
{
	u32 reg_val = 0;
	u32 mtl_debug = 0, dma_debug = 0;
	int queue_count = 0;
	int i = 0;

	reg_val = readl(priv->ioaddr + XGMAC_MAC_DEBUG);

	if (reg_val) {
		netdev_dbg(phy_info->dev, "%s %d mac debug %#x\n", __func__, __LINE__, reg_val);
		if (reg_val & XGMAC_MAC_RX_FIFO_ACT) {
			/* check rx fifo and dma valid */
			queue_count = priv->plat_ex->rx_queues_total;
			for (i = 0; i < queue_count; i++) {
				mtl_debug = readl(priv->ioaddr + XGMAC_MTL_RXQ_DEBUG(i));
				dma_debug = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(i));
				if (mtl_debug ||
				    ((dma_debug & XGMAC_RXDMA_FSM_STATE_MASK) != XGMAC_RXDMA_FSM_STATE &&
				     dma_debug != 0)) {
					netdev_dbg(phy_info->dev, "%s %d queue %d mtl_debug %#x dma_debug %#x\n",
						   __func__, __LINE__, i, mtl_debug, dma_debug);
					return false;
				}
			}
		}
		if (reg_val & XGMAC_MAC_TX_FIFO_ACT) {
			/* check tx fifo and dma valid */
			queue_count = priv->plat_ex->tx_queues_total;
			for (i = 0; i < queue_count; i++) {
				mtl_debug = readl(priv->ioaddr + XGMAC_MTL_TXQ_DEBUG(i));
				dma_debug = readl(priv->ioaddr + XGMAC_CH_DEBUG_ST(i));
				if (mtl_debug ||
				    ((dma_debug & XGMAC_TXDMA_FSM_STATE_MASK) != XGMAC_TXDMA_FSM_STATE &&
				     dma_debug != 0)) {
					netdev_dbg(phy_info->dev, "%s %d queue %d mtl_debug %#x dma_debug %#x\n",
						   __func__, __LINE__, i, mtl_debug, dma_debug);
					if (!priv->mii)
						return false;
				}
			}
		}
		/* Two busy snapshots in a row: report a global DMA error. */
		if (!phy_info->mac_debug_active) {
			phy_info->mac_debug_active = true;
		} else {
			dn200_global_err(priv, DN200_DMA_DEBUG_ERR);
			phy_info->mac_debug_active = false;
		}

		return false;
	}
	/* dma operation normal, can link up */
	return true;
}

/*
 * Remove 10G and/or 1G link modes from the ethtool masks.
 * @type: bitmask of DN200_SFP_TYPE_10000 / DN200_SFP_TYPE_1000.
 * @is_sup: true clears "supported", false clears "advertising".
 */
static void dn200_clear_lks(struct dn200_phy_info *phy_info, int type, bool is_sup)
{
	struct ethtool_link_ksettings *lks = &phy_info->lks;

	if (type & DN200_SFP_TYPE_10000) {
		if (is_sup) {
			DN200_CLR_SUP(lks, 10000baseKR_Full);
			DN200_CLR_SUP(lks, 10000baseCR_Full);
			DN200_CLR_SUP(lks, 10000baseLR_Full);
			DN200_CLR_SUP(lks, 10000baseER_Full);
			DN200_CLR_SUP(lks, 10000baseLRM_Full);
			DN200_CLR_SUP(lks, 10000baseSR_Full);
		} else {
			DN200_CLR_ADV(lks, 10000baseKR_Full);
			DN200_CLR_ADV(lks, 10000baseCR_Full);
			DN200_CLR_ADV(lks, 10000baseLR_Full);
			DN200_CLR_ADV(lks, 10000baseER_Full);
			DN200_CLR_ADV(lks, 10000baseLRM_Full);
			DN200_CLR_ADV(lks, 10000baseSR_Full);
		}
	}
	if (type & DN200_SFP_TYPE_1000) {
		if (is_sup) {
			DN200_CLR_SUP(lks, 1000baseT_Full);
			DN200_CLR_SUP(lks, 100baseT_Full);
			DN200_CLR_SUP(lks, 1000baseX_Full);
		} else {
			DN200_CLR_ADV(lks, 1000baseT_Full);
			DN200_CLR_ADV(lks, 100baseT_Full);
			DN200_CLR_ADV(lks, 1000baseX_Full);
		}
	}
}

/*
 * PF link-state poll: handles external-PHY and SFP/XPCS paths, SFP module
 * presence/los/fault, AN retriggering, speed self-adaptation, and on a
 * state change starts/stops the MAC and carrier
 * (final "return 0;" is in the next segment).
 */
static int dn200_phy_status(struct dn200_phy_info *phy_info)
{
	bool old_link = phy_info->link_status;
	bool link_status;
	struct dn200_priv *priv = netdev_priv(phy_info->dev);
	u8 sfp_rx_los = phy_info->sfp_rx_los;
	struct ethtool_link_ksettings *lks = &phy_info->lks;

	if (!test_bit(DN200_PHY_STARTED, &phy_info->phy_state))
		return 0;
	if (test_bit(DN200_PHY_IN_RESET, &phy_info->phy_state))
		return 0;
	if (!(phy_info->dev->flags & IFF_UP))
		return 0;

	if (phy_info->phydev) {
		extern_phy_read_status(phy_info->phydev);
		link_status = phy_info->phydev->link;
		if (phy_info->phy_loopback_flag) {
			/* Loopback: force 1G/full-duplex link up. */
			link_status = DN200_LINK_UP;
			phy_info->phydev->speed = SPEED_1000;
			phy_info->phydev->duplex = DUPLEX_FULL;
			if (phy_info->last_link_speed != phy_info->speed)
				old_link = DN200_LINK_DOWN;
		}
		phy_info->speed = phy_info->phydev->speed;
		phy_info->dup = phy_info->phydev->duplex;
		if (phy_info->self_adap_reset) {
			phy_info->self_adap_reset = false;
			link_status = DN200_LINK_DOWN;
		}
	} else {
		if (phy_info->self_adap_reset)
			return 0;
		dn200_phy_sfp_present(phy_info);
		dn200_phy_sfp_rx_los(phy_info);
		dn200_phy_sfp_tx_falut(phy_info);
		if (phy_info->sfp_mod_absent || phy_info->sfp_rx_los
		    || phy_info->sfp_tx_falut) {
			dev_dbg(priv->device, "sfp_mod_absent =%d, sfp_rx_los = %d, sfp_tx_fault = %d\n",
				phy_info->sfp_mod_absent, phy_info->sfp_rx_los, phy_info->sfp_tx_falut);
			link_status = DN200_LINK_DOWN;
			phy_info->recfg_an = true;
			if (phy_info->sfp_mod_absent) {
				/* Module pulled: restore default mode masks. */
				phy_info->sfp_speed = DN200_SFP_SPEED_UNKNOWN;
				phy_info->sfp_changed = true;
				dn200_clear_lks(phy_info, DN200_SFP_TYPE_1000 | DN200_SFP_TYPE_10000, true);
				DN200_SET_SUP(lks, 1000baseX_Full);
				DN200_SET_SUP(lks, 10000baseKR_Full);
				DN200_LM_COPY(lks, advertising, lks, supported);
				phy_info->link_modes = DN300_10000baseKR_Full | DN300_1000BASEX_Full;
			}
			if (phy_info->phy_multispeed_work.work.func)
				cancel_delayed_work_sync(&phy_info->phy_multispeed_work);
			if (phy_info->kr_train_work.func)
				cancel_work_sync(&phy_info->kr_train_work);
		} else {
			/* sfp rx power detect, reinit an */
			if (sfp_rx_los && !phy_info->sfp_rx_los) {
				if (phy_info->phy_ops->identity(phy_info) < 0)
					return DN200_LINK_DOWN;
				/* sfp only support 1G module, switch to 1G */
				if (phy_info->sfp_speed == DN200_SFP_SPEED_1000 && phy_info->speed != SPEED_1000)
					dn200_speed_set(phy_info, SPEED_1000);
			}

			/* autoneg on, try auto firstly */
			if (phy_info->an && phy_info->recfg_an) {
				dn200_phy_link_train_config(phy_info);
				phy_info->recfg_an = false;
				return DN200_LINK_DOWN;
			}
			dn200_phy_speed_self_adapt(phy_info);
			link_status = dn200_link_status_get(phy_info);
		}
		if (link_status &&
		    (phy_info->last_link_speed != phy_info->speed)) {
			phy_info->self_adap_reset = true;
			phy_info->last_link_speed = phy_info->speed;
			/* Stop the I2C controller, avoid link-partner's link status
			 * has undergone multiple changes after switch speed
			 * when vfs has been enabled.
			 */
			phy_info->sfp_tx_disable = 1;
			dn200_phy_set_sfp_tx_disable(phy_info);
			dn200_normal_reset(priv);
			return 0;
		}
	}
	phy_info->phy_ops->blink_control(phy_info, link_status);
	if (old_link == link_status)
		return 0;
	/* sfp only support 10G, but can change 1000,and link */
	if (phy_info->sfp_speed == DN200_SFP_SPEED_10000 && phy_info->speed == SPEED_1000) {
		dn200_clear_lks(phy_info, DN200_SFP_TYPE_10000, true);
		phy_info->link_modes = phy_info->link_modes & 0x7;
		DN200_SET_ADV(lks, 1000baseX_Full);
		DN200_SET_SUP(lks, 1000baseX_Full);
	}
	if (link_status) {
		/* Do not report link up while the datapath is still busy. */
		if (!dn200_mac_dma_link_check(priv, phy_info))
			return 0;
	}
	phy_info->blk_err_ck = false;
	phy_info->phy_ops->led_control(phy_info, link_status);
	phy_info->link_status = link_status;
	dn200_phy_info_state_change(phy_info);
	dn200_phy_print_status(phy_info);
	if (PRIV_SRIOV_SUPPORT(priv))
		dn200_pf_set_link_status(priv->hw, phy_info->link_status);
	if (link_status) {
		if (phy_info->phy_multispeed_work.work.func)
			cancel_delayed_work_sync(&phy_info->phy_multispeed_work);
		phy_info->mac_ops->mac_link_up(priv, phy_info->phydev, 0,
					       phy_info->phy_interface,
					       phy_info->speed, phy_info->dup,
					       phy_info->pause & MLO_PAUSE_TX,
					       phy_info->pause & MLO_PAUSE_RX);
		clear_bit(DN200_PHY_SFP_NEED_RESET, &phy_info->phy_state);
		netif_carrier_on(phy_info->dev);
		netif_tx_start_all_queues(priv->dev);
		linkwatch_fire_event(phy_info->dev);
		phy_info->last_link_speed = phy_info->speed;
		phy_info->phy_status_time_intr = DN200_PHY_STATUS_NINTR;
	} else {
		netif_carrier_off(phy_info->dev);
		netif_tx_stop_all_queues(priv->dev);
		linkwatch_fire_event(phy_info->dev);
		phy_info->mac_ops->mac_link_down(priv, 0,
						 phy_info->phy_interface);
		clear_bit(DN200_PHY_SFP_INITED, &phy_info->phy_state);
		set_bit(DN200_PHY_SFP_NEED_RESET, &phy_info->phy_state);
		if (netif_msg_ifdown(priv))
			dn200_xpcs_link_down_reg_dump(phy_info);
		phy_info->phy_status_time_intr = DN200_PHY_STATUS_DINTR;
	}
	/* NOTE(review): mangled patch segment — closes dn200_phy_status(). */
	return 0;
}

/* PHY status interrupt: run the PF link-state poll from IRQ context. */
irqreturn_t dn200_phy_status_isr(int irq, void *dev_id)
{
	struct net_device *dev = (struct net_device *)dev_id;
	struct dn200_priv *priv = netdev_priv(dev);

	dn200_phy_status(PRIV_PHY_INFO(priv));
	return IRQ_HANDLED;
}

/* VF path: pull PHY info from the PF via the hw mailbox and rebuild the
 * ethtool supported/advertising masks from the reported link modes.
 */
static void dn200_vf_get_phy_info(struct dn200_phy_info *phy_info)
{
	struct dn200_sriov_phy_info sriov_info = { 0 };
	struct ethtool_link_ksettings *lks = &phy_info->lks;

	linkmode_zero(lks->link_modes.lp_advertising);
	linkmode_zero(lks->link_modes.advertising);
	linkmode_zero(lks->link_modes.supported);
	dn200_get_phy_info(((struct dn200_priv *)netdev_priv(phy_info->dev))->
			   hw, &sriov_info);

	phy_info->speed = sriov_info.speed;
	phy_info->dup = sriov_info.dup;
	phy_info->port_type = -1;
	phy_info->media_type = sriov_info.media_type;
	phy_info->an = AUTONEG_DISABLE;
	phy_info->pause = sriov_info.pause;
	phy_info->phy_interface = sriov_info.phy_interface;
	phy_info->link_modes = sriov_info.link_modes;

	if (phy_info->link_modes & DN300_100BASET_Full)
		DN200_SET_SUP(lks, 100baseT_Full);
	if (phy_info->link_modes & DN300_1000BASET_Full)
		DN200_SET_SUP(lks, 1000baseT_Full);
	if (phy_info->link_modes & DN300_1000BASEX_Full)
		DN200_SET_SUP(lks, 1000baseX_Full);
	if (phy_info->link_modes & DN300_10000baseSR_Full)
		DN200_SET_SUP(lks, 10000baseSR_Full);
	if (phy_info->link_modes & DN300_10000baseLRM_Full)
		DN200_SET_SUP(lks, 10000baseLRM_Full);
	if (phy_info->link_modes & DN300_10000baseLR_Full)
		DN200_SET_SUP(lks, 10000baseLR_Full);
	if (phy_info->link_modes & DN300_10000baseKR_Full)
		DN200_SET_SUP(lks, 10000baseKR_Full);
	if (phy_info->link_modes & DN300_10000baseCR_Full)
		DN200_SET_SUP(lks, 10000baseCR_Full);
	if (phy_info->link_modes & DN300_10000baseER_Full)
		DN200_SET_SUP(lks, 10000baseER_Full);
	DN200_LM_COPY(lks, advertising, lks, supported);
}

/* VF link-state poll: mirror the PF-reported link into carrier/queue state. */
static int dn200_vf_phy_status(struct dn200_phy_info *phy_info)
{
	bool old_link = phy_info->link_status;
	bool link_status;
	struct dn200_priv *priv = netdev_priv(phy_info->dev);

	link_status = dn200_vf_get_link_status(priv->hw);
	if (old_link == link_status)
		return 0;
	dn200_vf_get_phy_info(phy_info);
	if (link_status) {
		phy_info->mac_ops->mac_link_up(priv, phy_info->phydev, 0,
					       phy_info->phy_interface,
					       phy_info->speed, phy_info->dup,
					       phy_info->pause & MLO_PAUSE_TX,
					       phy_info->pause & MLO_PAUSE_RX);
		netif_carrier_on(phy_info->dev);
		netif_tx_start_all_queues(priv->dev);
	} else {
		netif_carrier_off(phy_info->dev);
		netif_tx_stop_all_queues(priv->dev);
	}
	phy_info->link_status = link_status;

	dn200_phy_print_status(phy_info);
	return 0;
}

/* Switch MAC and XPCS to the requested speed (10G or 1G only; others ignored). */
static void dn200_speed_set(struct dn200_phy_info *phy_info, u32 speed)
{
	struct dn200_priv *priv = netdev_priv(phy_info->dev);

	if (speed == SPEED_10000) {
		/* init as 10G */
		phy_info->speed = SPEED_10000;
		phy_info->mac_ops->mac_speed_set(priv, phy_info->phy_interface,
						 phy_info->speed);
		dn200_xpcs_switch_to_10G(phy_info);
	} else if (speed == SPEED_1000) {
		/* init as 1G */
		phy_info->speed = SPEED_1000;
		phy_info->mac_ops->mac_speed_set(priv, phy_info->phy_interface,
						 phy_info->speed);
		dn200_xpcs_switch_to_1G(phy_info);
	}
}

/*
 * Multispeed retry: while the link is down, toggle between 10G and 1G
 * according to the configured maximum speed and what the SFP supports,
 * then re-arm autoneg.  Serialized via DN200_PHY_IN_SFP_INIT.
 */
static void dn200_multispeed_setup(struct dn200_phy_info *phy_info)
{
	u32 highest_speed = phy_info->setting_speed;

	/* phy stop, just return */
	if (!test_bit(DN200_PHY_STARTED, &phy_info->phy_state))
		return;
	/* this sets the link speed and restarts auto-neg */
	while (test_and_set_bit(DN200_PHY_IN_SFP_INIT, &phy_info->phy_state))
		usleep_range(1000, 2000);
	netdev_dbg(phy_info->dev,
		   "%s %d link_status %d highest_speed %d sfp_speed %d current speed %d\n",
		   __func__, __LINE__, phy_info->link_status, highest_speed,
		   phy_info->sfp_speed, phy_info->speed);
	if (phy_info->link_status)
		goto clear_out;

	/* Read to clear block err */
	dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_PCS_10GBRT_STAT2);
	/* max speed 10000 and sff support multispeed speed, than init as 10G firstly, otherwise init as 1G secondly
	 * if both 10 and 1G link failed, restore the default settings.
	 */
	if (highest_speed == SPEED_10000
	    && (phy_info->sfp_speed & DN200_SFP_SPEED_10000)
	    && phy_info->speed == SPEED_1000) {
		/* init as 10G firstly */
		dn200_speed_set(phy_info, SPEED_10000);
	} else if (((highest_speed == SPEED_10000 &&
		     (phy_info->sfp_speed & DN200_SFP_SPEED_1000)) ||
		    (highest_speed == SPEED_1000 &&
		     (phy_info->sfp_speed & DN200_SFP_SPEED_1000))) &&
		   phy_info->speed == SPEED_10000) {
		/* init as 1G */
		dn200_speed_set(phy_info, SPEED_1000);
	}
	phy_info->recfg_an = true;
clear_out:
	clear_bit(DN200_PHY_IN_SFP_INIT, &phy_info->phy_state);
}

/* Workqueue: refresh link state and report PF link changes to firmware. */
static void dn200_phy_status_service(struct work_struct *work)
{
	struct dn200_phy_info *phy_info = container_of(work,
						       struct dn200_phy_info,
						       phy_status_work);
	struct dn200_priv *priv = netdev_priv(phy_info->dev);
	bool old_link = phy_info->link_status;

	if (!phy_info->phydev) {
		/* someone else is in init, wait until next timer event */
		if (test_bit(DN200_PHY_IN_SFP_INIT, &phy_info->phy_state) ||
		    test_bit(DN200_PHY_IN_TRAIN, &phy_info->phy_state))
			return;
	}
	phy_info->phy_ops->link_status(phy_info);
	if (!PRIV_IS_VF(priv) && phy_info->link_status != old_link)
		fw_link_state_set(&priv->plat_ex->ctrl, phy_info->link_status, phy_info->dup, phy_info->speed);
}

/* Delayed-work wrapper around dn200_multispeed_setup(). */
static void dn200_phy_multispeed_service(struct work_struct *work)
{
	struct dn200_phy_info *phy_info = container_of(work,
						       struct dn200_phy_info,
						       phy_multispeed_work.work);

	dn200_multispeed_setup(phy_info);
}

/*
 * Periodic link-status timer.  NOTE(review): the definition continues past
 * this view — only the visible head fragment is reproduced here.
 */
static void dn200_phy_status_timer(struct timer_list *t)
{
	struct dn200_phy_info *phy_info =
	    from_timer(phy_info, t, phy_status_timer);
	struct dn200_priv *priv = netdev_priv(phy_info->dev);
	u16 reg_val = 0;

	/* Skip polling while the admin queue or PCIe link is unusable. */
	if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) ||
	    test_bit(DN200_PCIE_UNAVAILD, &priv->state))
		return;
	if
(!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return; + } + if (!PRIV_IS_VF(priv) && !phy_info->phydev) { + _phy_reg_read(phy_info, 0x20b3 + (3 - phy_info->xpcs_idx) * 0x200, &reg_val); + if (!(reg_val & BIT(0))) { + netdev_err(priv->dev, "%s :mpllA lock failed\n", __func__); + dn200_global_err(priv, DN200_PHY_MPLLA_UNLOCK); + return; + } + } + queue_work(phy_info->dev_workqueue, &phy_info->phy_status_work); + + mod_timer(&phy_info->phy_status_timer, jiffies + msecs_to_jiffies(phy_info->phy_status_time_intr)); +} + +static void dn200_init_phy_status_timers(struct dn200_phy_info *phy_info) +{ + timer_setup(&phy_info->phy_status_timer, dn200_phy_status_timer, 0); +} + +static void dn200_start_phy_status_timers(struct dn200_phy_info *phy_info) +{ + mod_timer(&phy_info->phy_status_timer, jiffies + msecs_to_jiffies(phy_info->phy_status_time_intr)); +} + +static void dn200_stop_phy_status_timers(struct dn200_phy_info *phy_info) +{ + del_timer_sync(&phy_info->phy_status_timer); +} + +static int dn200_xpcs_an_config(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0; + + if (phy_info->an == DN200_AN_DISABLE) { + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1); + reg_val &= (~AN_CTRL_AN_EN); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, reg_val); + return 0; + } + if (phy_info->an) { + /* Update xpcs counter */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_TIMER_CTRL0, GENMASK(15, 0), 0, + 0x11e1); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_TIMER_CTRL1, GENMASK(15, 0), 0, + 0x9502); + + /*CL73_TMR_OVR_RIDE */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_DIG_CTRL1, CL73_TMR_OVR_RIDE, +
CL73_TMR_OVR_RIDE_S, 1); + + /* 1. Disable auto-negotiation first */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, + 0); + /* Advertise support speed mode - 10G(Bit7) & 1G(Bit5) */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV2, 0); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV2, GENMASK(15, 0), 0, 0x80); + /* Provide remote fault to link partner */ + /*AN_ADV_RF_13 */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV1, AN_ADV_RF_13_MASK, + AN_ADV_RF_13_SHIFT, 1); + /* AN_ADV_ACK */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV1, AAN_ADV_ACK_MASK, + AN_ADV_ACK_SHIFT, 0); + /*AN_ADV_NP */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + SR_AN_ADV1, AN_ADV_NP, AN_ADV_NP_S, 0); + + /* Clear intr status and enable xpcs intr */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR, GENMASK(2, 0), 0, 0); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR_MSK, GENMASK(2, 0), 0, 0x7); + + /* disable rx data output on lane 0 */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, VR_RX_GENCTRL0, + RX_DT_EN_0_MASK, RX_DT_EN_0_S, 0); + + /*1: disable kr train */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, + MDIO_MMD_PMAPMD, SR_PMA_KR_PMD_CTRL, TR_EN, + TR_EN_S, 0); + /*2: disable an */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, + 0); + /*disable an restart */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_RESTART, + AN_CTRL_AN_RESTART_S, 0); + /*disable interrupt */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR_MSK, GENMASK(2, 0), 0, 0); + /*3: enable interrupt */ + /* Clear intr status and enable xpcs intr */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + 
VR_AN_INTR, GENMASK(2, 0), 0, 0); + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + VR_AN_INTR_MSK, GENMASK(2, 0), 0, 0x7); + /*disable an exten np */ + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, GENMASK(13, 13), 13, 0); + } + + phy_info->an_sucess = false; + return 0; +} + +static int dn200_phy_an_config(struct dn200_phy_info *phy_info) +{ + int ret; + + if (phy_info->phydev) { + dn200_linkmode_and(phy_info->phydev->advertising, + phy_info->phydev->supported, + phy_info->lks.link_modes.advertising); + phy_info->phydev->autoneg = phy_info->an; + + if (phy_info->speed == SPEED_1000 && + phy_info->dup != DN200_DUP_FULL) + return -EINVAL; + + if (phy_info->an != AUTONEG_ENABLE) { + phy_info->phydev->speed = phy_info->speed; + phy_info->phydev->duplex = phy_info->dup; + if (phy_info->speed == SPEED_1000) { + clear_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, + phy_info->phydev->advertising); + clear_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, + phy_info->phydev->advertising); + clear_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, + phy_info->phydev->advertising); + clear_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, + phy_info->phydev->advertising); + clear_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, + phy_info->phydev->advertising); + set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + phy_info->phydev->advertising); + phy_info->phydev->autoneg = AUTONEG_ENABLE; + } + } + + ret = phy_start_aneg(phy_info->phydev); + phy_info->self_adap_reset = true; + return ret; + } + return dn200_xpcs_an_config(phy_info); +} + +static int dn200_phy_identity(struct dn200_phy_info *phy_info) +{ + int err = 0; + + if (phy_info->phydev) + err = dn200_extern_phy_identity(phy_info); + if (phy_info->media_type == DN200_MEDIA_TYPE_XPCS_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_XPCS_10GBASEKR) { + if (!phy_info->xpcs_sfp_valid) + return err; + clear_bit(DN200_PHY_SFP_INITED, &phy_info->phy_state); + err = dn200_sfp_module_identify(phy_info); + if (err) + 
return err; + dn200_phy_sfp_detect(phy_info); + phy_info->blk_err_ck = false; + phy_info->blk_err_cnt = 0; + if (phy_info->sfp_speed == DN200_SFP_SPEED_UNKNOWN) + /*sfp speed get failed, try once */ + dn200_phy_sfp_detect(phy_info); + if (phy_info->sfp_changed) { + /*sfp module has been changed, change current speed*/ + if (phy_info->sfp_speed & DN200_SFP_SPEED_10000 && phy_info->setting_speed == SPEED_10000) + dn200_speed_set(phy_info, SPEED_10000); + if (phy_info->sfp_speed == DN200_SFP_SPEED_1000) + dn200_speed_set(phy_info, SPEED_1000); + phy_info->sfp_changed = false; + } + return 1; + } + return err; +} + +static int dn200_phy_media_type(struct dn200_phy_info *phy_info) +{ + switch (phy_info->phy_interface) { + /*extern 1G copper */ + case PHY_INTERFACE_MODE_RGMII_ID: + phy_info->media_type = DN200_MEDIA_TYPE_PHY_COPPER; + phy_info->phydev_mode = DN200_MDIO_MODE_CL22; + break; + /*XPCS 10G fibre */ + case PHY_INTERFACE_MODE_XGMII: + if (phy_info->setting_speed == SPEED_1000) + phy_info->media_type = DN200_MEDIA_TYPE_XPCS_1000BASEX; + else + phy_info->media_type = DN200_MEDIA_TYPE_XPCS_10GBASEKR; + break; + /*XPCS 1G fibre */ + case PHY_INTERFACE_MODE_GMII: + phy_info->media_type = DN200_MEDIA_TYPE_XPCS_1000BASEX; + break; + /*extern 1G fibre */ + case PHY_INTERFACE_MODE_1000BASEX: + phy_info->media_type = DN200_MEDIA_TYPE_PHY_1000BASEX; + phy_info->phydev_mode = DN200_MDIO_MODE_CL22; + break; + default: + break; + } + return 0; +} + +static int dn200_phy_set_link_mode(struct dn200_phy_info *phy_info, + enum dn200_media_type link_mode) +{ + struct ethtool_link_ksettings *lks = &phy_info->lks; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + DN200_ZERO_SUP(lks); + /*Set phy supported features */ + switch (link_mode) { + case DN200_MEDIA_TYPE_PHY_COPPER: + DN200_SET_SUP(lks, TP); + DN200_SET_SUP(lks, 10baseT_Full); + DN200_SET_SUP(lks, 10baseT_Half); + DN200_SET_SUP(lks, 100baseT_Full); + DN200_SET_SUP(lks, 100baseT_Half); + DN200_SET_SUP(lks, 
1000baseT_Full); + DN200_SET_SUP(lks, Autoneg); + DN200_SET_SUP(lks, Pause); + phy_info->port_type = PORT_TP; + if (phy_info->setting_speed <= 0) { + phy_info->setting_speed = SPEED_1000; + priv->flow_ctrl_an = true; + } + break; + case DN200_MEDIA_TYPE_XPCS_1000BASEX: + case DN200_MEDIA_TYPE_PHY_1000BASEX: + phy_info->pause &= ~MLO_PAUSE_AN; + phy_info->an = phy_info->cur_an; + if (phy_info->setting_speed <= 0) { + phy_info->setting_speed = SPEED_1000; + phy_info->speed = SPEED_1000; + } + if (phy_info->pause) { + DN200_SET_SUP(lks, Pause); + DN200_SET_SUP(lks, Asym_Pause); + } + if (phy_info->an) + DN200_SET_SUP(lks, Autoneg); + phy_info->port_type = PORT_FIBRE; + phy_info->dup = DN200_DUP_FULL; + break; + case DN200_MEDIA_TYPE_XPCS_10GBASEKR: + DN200_SET_SUP(lks, FIBRE); + phy_info->port_type = PORT_FIBRE; + phy_info->an = phy_info->cur_an; + if (phy_info->setting_speed <= 0) { + phy_info->setting_speed = SPEED_10000; + phy_info->speed = SPEED_10000; + } + if (phy_info->an) + DN200_SET_SUP(lks, Autoneg); + phy_info->pause &= ~MLO_PAUSE_AN; + if (phy_info->pause & (MLO_PAUSE_TX | MLO_PAUSE_RX)) { + DN200_SET_SUP(lks, Pause); + DN200_SET_SUP(lks, Asym_Pause); + } + phy_info->dup = DN200_DUP_FULL; + break; + default: + break; + } + if (!priv->mii || bitmap_empty(lks->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS)) + DN200_LM_COPY(lks, advertising, lks, supported); + return 0; +} + +#define XPCS0_CR_REG_BASE (0x40100) +#define XPCS0_CR_CTRL 0x0 +#define XPCS0_CR_ADDR 0x4 +#define XPCS0_CR_DATA 0x8 +static int _phy_reg_write(struct dn200_phy_info *phy_info, u16 phy_reg_addr, + u16 reg_val) +{ + int ret = 0; + u32 val; + unsigned long flags; + + spin_lock_irqsave(&dn200_xpcs_lock, flags); + ret = + readl_poll_timeout_atomic(phy_info->xpcs->xpcs_regs_base + + XPCS0_CR_REG_BASE + XPCS0_CR_CTRL, val, + !(val & BIT(0)), 1, 10000); + if (ret) + goto out; + + writel((u32) phy_reg_addr, + phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_ADDR); + 
writel((u32) reg_val, + phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_DATA); + writel(0x3, + phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_CTRL); + +out: + spin_unlock_irqrestore(&dn200_xpcs_lock, flags); + return ret; +} + +/* Used to validate xpcs phy fw write function. */ +static int _phy_reg_read(struct dn200_phy_info *phy_info, + u16 phy_reg_addr, u16 *reg_val) +{ + int ret = 0; + u32 val; + unsigned long flags; + + spin_lock_irqsave(&dn200_xpcs_lock, flags); + + ret = + readl_poll_timeout_atomic(phy_info->xpcs->xpcs_regs_base + + XPCS0_CR_REG_BASE + XPCS0_CR_CTRL, val, + !(val & BIT(0)), 1, 10000); + if (ret) + goto out; + + writel((u32) phy_reg_addr, + phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_ADDR); + writel(0x1, + phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_CTRL); + + ret = + readl_poll_timeout_atomic(phy_info->xpcs->xpcs_regs_base + + XPCS0_CR_REG_BASE + XPCS0_CR_CTRL, val, + !(val & BIT(0)), 1, 10000); + if (ret) + goto out; + + *reg_val = + (u16) readl(phy_info->xpcs->xpcs_regs_base + XPCS0_CR_REG_BASE + + XPCS0_CR_DATA); + +out: + spin_unlock_irqrestore(&dn200_xpcs_lock, flags); + return ret; +} + +static int dn200_xpcs_switch_to_1G_tx_regset(struct dn200_phy_info *phy_info) +{ + u8 phylane_idx = 3 - phy_info->xpcs_idx; + u16 reg_val; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + _phy_reg_write(phy_info, 0x2048 + 0x200 * phylane_idx, 0x501f);/*TX*/ + _phy_reg_write(phy_info, 0x2047 + 0x200 * phylane_idx, 0xe461);/*TX*/ + _phy_reg_write(phy_info, 0x2046 + 0x200 * phylane_idx, 0x0300);/*TX*/ + _phy_reg_write(phy_info, 0x2049 + 0x200 * phylane_idx, 0x18);/*TX*/ + _phy_reg_write(phy_info, 0x204a + 0x200 * phylane_idx, 0x0000);/*TX*/ + _phy_reg_write(phy_info, 0x100c + 0x200 * phylane_idx, 0x0180);/*TX*/ + _phy_reg_write(phy_info, 0x204c + 0x200 * phylane_idx, 0x7e28);/*TX*/ + _phy_reg_write(phy_info, 0x204d + 0x200 * phylane_idx, 0x4);/*TX*/ + 
_phy_reg_write(phy_info, 0x1003 + 0x200 * phylane_idx, 0x0588);/*TX*/ + _phy_reg_write(phy_info, 0x1004 + 0x200 * phylane_idx, 0x2040);/*TX*/ + _phy_reg_write(phy_info, 0x1002 + 0x200 * phylane_idx, 0x009c);/*TX*/ + if (priv->plat_ex->raid_supported) + _phy_reg_write(phy_info, 0x1152 + 0x200 * phylane_idx, 0x80a0);/*TX*/ + /* Config PHY: choose context and set Tx width */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_TXCNTX_CTRL0, 0x0808);/*TX*/ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_TXCMCNTX_SEL, 0x0404);/*TX*/ + + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TXWIDTH_CTRL, 0x1111);/*TX*/ + + /* Select KX Mode */ + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + MDIO_CTRL2); + reg_val &= ~GENMASK(3, 0); + reg_val |= 0x1 & GENMASK(3, 0); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, MDIO_CTRL2, + reg_val);/*TX*/ + + /* Clear USXG_EN and EN_2_5G_MODE */ + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + VR_XS_PCS_DIG_CTRL1); + reg_val &= ~(XPCS_PCS_BYP_PWRUP_DUP1 | XPCS_EN_2_5G_MODE); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + VR_XS_PCS_DIG_CTRL1, reg_val);/*TX*/ + + /* Select 1G mode */ + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + MDIO_CTRL1); + reg_val &= ~BIT(13); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + MDIO_CTRL1, reg_val);/*TX*/ + + /*TX eq override */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TX_EQ_CTRL0, 0x1800);/*TX*/ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TX_EQ_CTRL1, 0x40);/*TX*/ + /* Change xge top clock mux */ + writel(0x0, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_CTRL(phy_info->xpcs_idx)); + return 0; +} + +static int dn200_xpcs_switch_to_1G_rx_regset(struct dn200_phy_info *phy_info) +{ + 
u8 phylane_idx = 3 - phy_info->xpcs_idx; + + _phy_reg_write(phy_info, 0x2005 + 0x200 * phylane_idx, 0x060c); /*rx ovrd */ + _phy_reg_write(phy_info, 0x2052 + 0x200 * phylane_idx, 0x0550);/*RX*/ + _phy_reg_write(phy_info, 0x2053 + 0x200 * phylane_idx, 0x4052);/*RX*/ + _phy_reg_write(phy_info, 0x2054 + 0x200 * phylane_idx, 0x2200);/*RX*/ + _phy_reg_write(phy_info, 0x204f + 0x200 * phylane_idx, 0x3014);/*RX*/ + _phy_reg_write(phy_info, 0x2050 + 0x200 * phylane_idx, 0x7514);/*RX*/ + _phy_reg_write(phy_info, 0x2056 + 0x200 * phylane_idx, 0xd841);/*RX*/ + _phy_reg_write(phy_info, 0x103d + 0x200 * phylane_idx, 0x0180);/*RX*/ + _phy_reg_write(phy_info, 0x2059 + 0x200 * phylane_idx, 0x656d);/*RX*/ + _phy_reg_write(phy_info, 0x2051 + 0x200 * phylane_idx, 0x0403);/*RX*/ + _phy_reg_write(phy_info, 0x205a + 0x200 * phylane_idx, 0x2021);/*RX*/ + _phy_reg_write(phy_info, 0x1021 + 0x200 * phylane_idx, 0x9c00);/*RX*/ + _phy_reg_write(phy_info, 0x200c + 0x200 * phylane_idx, 0x0010);/*RX*/ + _phy_reg_write(phy_info, 0x20f9 + 0x200 * phylane_idx, 0x01f4);/*TX*/ + _phy_reg_write(phy_info, 0x1022 + 0x200 * phylane_idx, 0x0550); + /* Config PHY: choose context and set Tx/Rx width */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_RXCNTX_CTRL0, 0x0808);/*RX*/ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_RXWIDTH_CTRL, 0x1111);/*RX*/ + return 0; +} + +static int dn200_xpcs_switch_to_10G_rx_regset(struct dn200_phy_info *phy_info) +{ + u8 phylane_idx = 3 - phy_info->xpcs_idx; + + _phy_reg_write(phy_info, 0x2005 + 0x200 * phylane_idx, 0x18); + _phy_reg_write(phy_info, 0x2053 + 0x200 * phylane_idx, 0x4013); + _phy_reg_write(phy_info, 0x204f + 0x200 * phylane_idx, 0x4110); + _phy_reg_write(phy_info, 0x2050 + 0x200 * phylane_idx, 0x6514); + _phy_reg_write(phy_info, 0x2056 + 0x200 * phylane_idx, 0xd841); + _phy_reg_write(phy_info, 0x103d + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x2059 + 0x200 * 
phylane_idx, 0xffff); + _phy_reg_write(phy_info, 0x205a + 0x200 * phylane_idx, 0xff7f); + _phy_reg_write(phy_info, 0x2051 + 0x200 * phylane_idx, 0x6403); + _phy_reg_write(phy_info, 0x200c + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x20f9 + 0x200 * phylane_idx, 0x01e0); + /* rx_vco_ld_val=1650&&rx_ref_ld_val=20=>rx_clk */ + _phy_reg_write(phy_info, 0x1021 + 0x200 * phylane_idx, 0x0094); + _phy_reg_write(phy_info, 0x1022 + 0x200 * phylane_idx, 0x2672); + _phy_reg_write(phy_info, 0x2052 + 0x200 * phylane_idx, 0x2672); + _phy_reg_write(phy_info, 0x2054 + 0x200 * phylane_idx, 0x280c); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_RXCNTX_CTRL0, 0x0d0d); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_RXWIDTH_CTRL, 0x4444); + + return 0; +} + +static int dn200_xpcs_switch_to_10G_tx_regset(struct dn200_phy_info *phy_info) +{ + u8 phylane_idx = 3 - phy_info->xpcs_idx; + u16 reg_val; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + _phy_reg_write(phy_info, 0x2048 + 0x200 * phylane_idx, 0x4595); + _phy_reg_write(phy_info, 0x2047 + 0x200 * phylane_idx, 0xe7cf); + _phy_reg_write(phy_info, 0x2046 + 0x200 * phylane_idx, 0x1288); + _phy_reg_write(phy_info, 0x2049 + 0x200 * phylane_idx, 0x4110); + _phy_reg_write(phy_info, 0x204a + 0x200 * phylane_idx, 0x6637); + _phy_reg_write(phy_info, 0x100c + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x204c + 0x200 * phylane_idx, 0xfffe); + _phy_reg_write(phy_info, 0x204d + 0x200 * phylane_idx, 0x0fff); + _phy_reg_write(phy_info, 0x1003 + 0x200 * phylane_idx, 0x0180); + _phy_reg_write(phy_info, 0x1004 + 0x200 * phylane_idx, 0x0000); + // _phy_reg_write(phy_info, 0x1165 + 0x200 * phylane_idx, 0x10D); + _phy_reg_write(phy_info, 0x1002 + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x2052 + 0x200 * phylane_idx, 0x257b); + _phy_reg_write(phy_info, 0x2053 + 0x200 * phylane_idx, 0x4013); + _phy_reg_write(phy_info, 0x2054 + 
0x200 * phylane_idx, 0x220c); + _phy_reg_write(phy_info, 0x204f + 0x200 * phylane_idx, 0x4110); + _phy_reg_write(phy_info, 0x2050 + 0x200 * phylane_idx, 0x6514); + _phy_reg_write(phy_info, 0x2056 + 0x200 * phylane_idx, 0xd841); + _phy_reg_write(phy_info, 0x103d + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x2059 + 0x200 * phylane_idx, 0xffff); + _phy_reg_write(phy_info, 0x205a + 0x200 * phylane_idx, 0xff7f); + _phy_reg_write(phy_info, 0x2051 + 0x200 * phylane_idx, 0x6403); + _phy_reg_write(phy_info, 0x1021 + 0x200 * phylane_idx, 0x0014); + _phy_reg_write(phy_info, 0x200c + 0x200 * phylane_idx, 0x0000); + _phy_reg_write(phy_info, 0x20f9 + 0x200 * phylane_idx, 0x01e0); + if (priv->plat_ex->raid_supported) + _phy_reg_write(phy_info, 0x1152 + 0x200 * phylane_idx, 0x80a1); + /* Deselect KX Mode */ + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + MDIO_CTRL2); + reg_val &= ~GENMASK(3, 0); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, MDIO_CTRL2, + reg_val); + + /* Deselect 1G mode */ + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + MDIO_CTRL1); + reg_val |= BIT(13); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + MDIO_CTRL1, reg_val); + + /* Config PHY: choose context and set Tx/Rx width */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_TXCNTX_CTRL0, 0x0d0d); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP32G_TXCMCNTX_SEL, 0x0909); + + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TXWIDTH_CTRL, 0x4444); + + /*TX eq override */ + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TX_EQ_CTRL0, 0xc04); + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PMAPMD, + VR_XS_PMA_MP25G_TX_EQ_CTRL1, 0x40); + /* Change xge top clock mux */ + writel(0x6, + phy_info->xpcs->xpcs_regs_base + + XGE_XGMAC_CLK_MUX_CTRL(phy_info->xpcs_idx)); + + 
return 0; +} +static int dn200_phy_find_phy_device(struct dn200_phy_info *phy_info) +{ + int ret; + + if (!phy_info->phydev) + return -1; + + phy_info->phydev->link = 0; + ret = + phy_attach_direct(phy_info->dev, phy_info->phydev, + phy_info->phydev->dev_flags, + phy_info->phy_interface); + if (ret) { + netdev_err(phy_info->dev, "phy_attach_direct failed %d\n", ret); + return ret; + } + + phy_start_aneg(phy_info->phydev); + return 0; +} + +static void dn200_phy_free_phy_device(struct dn200_phy_info *phy_info) +{ + if (phy_info->phydev) { + phy_detach(phy_info->phydev); + /* for some system suspend warning */ + phy_info->phydev->state = PHY_READY; + } +} + +static int dn200_phy_set_speed(struct dn200_phy_info *phy_info) +{ + return 0; /*FIXME GUOFENG */ +} + +static void dn200_phy_timer_stop(struct dn200_phy_info *phy_info) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (phy_info->media_type == DN200_MEDIA_TYPE_PHY_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_PHY_COPPER) { + phy_info->phy_ops->stop_phy_timer(phy_info); + } else if (phy_info->media_type == DN200_MEDIA_TYPE_XPCS_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_XPCS_10GBASEKR) { + phy_info->phy_ops->stop_phy_timer(phy_info); + if (phy_info->phy_status_work.func) + cancel_work_sync(&phy_info->phy_status_work); + if (phy_info->kr_train_work.func) + cancel_work_sync(&phy_info->kr_train_work); + if (phy_info->phy_multispeed_work.work.func) + cancel_delayed_work_sync(&phy_info->phy_multispeed_work); + } + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) { + if (phy_info->link_status) { + phy_info->link_status = DN200_LINK_DOWN; + dn200_phy_print_status(phy_info); + } + if (PRIV_SRIOV_SUPPORT(priv)) + dn200_pf_set_link_status(priv->hw, phy_info->link_status); + } +} +static int dn200_phy_stop(struct dn200_phy_info *phy_info) +{ + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (!test_bit(DN200_PHY_STARTED, &phy_info->phy_state)) + return 0; + 
dn200_phy_timer_stop(phy_info); + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) { + phy_info->phy_ops->led_control(phy_info, DN200_LINK_DOWN); + if (!phy_info->phydev) + dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_DISABLE); + } + if (phy_info->media_type == DN200_MEDIA_TYPE_PHY_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_PHY_COPPER) { + dn200_phy_free_phy_device(phy_info); + } else if (phy_info->media_type == DN200_MEDIA_TYPE_XPCS_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_XPCS_10GBASEKR) { + /* Stop the I2C controller */ + phy_info->sfp_tx_disable = 1; + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + dn200_phy_set_sfp_tx_disable(phy_info); + } + clear_bit(DN200_PHY_STARTED, &phy_info->phy_state); + if (!test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + fw_link_state_set(&priv->plat_ex->ctrl, DN200_LINK_DOWN, phy_info->dup, phy_info->speed); + return 0; +} + +static int dn200_vf_phy_stop(struct dn200_phy_info *phy_info) +{ + if (!test_bit(DN200_PHY_STARTED, &phy_info->phy_state)) + return 0; + phy_info->phy_ops->stop_phy_timer(phy_info); + if (phy_info->phy_status_work.func) + cancel_work_sync(&phy_info->phy_status_work); + phy_info->link_status = DN200_LINK_DOWN; + netif_carrier_off(phy_info->dev); + netif_tx_stop_all_queues(phy_info->dev); + clear_bit(DN200_PHY_STARTED, &phy_info->phy_state); + dn200_phy_print_status(phy_info); + return 0; +} + +static void dn200_sfp_phy_state_init(struct dn200_phy_info *phy_info) +{ + bitmap_empty(&phy_info->phy_state, BITS_PER_LONG); + /*set sfp reset state to reinit sfp info */ + set_bit(DN200_PHY_SFP_NEED_RESET, &phy_info->phy_state); + dn200_phy_sfp_reset(phy_info); +} + +static int dn200_phy_start(struct dn200_phy_info *phy_info) +{ + int ret = 0; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + + if (phy_info->media_type == DN200_MEDIA_TYPE_PHY_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_PHY_COPPER) { + ret = dn200_phy_find_phy_device(phy_info); + if (ret) { + 
netdev_err(phy_info->dev, + "dn200_phy_find_phy_device failed %d\n", + ret); + return ret; + } + + extern_phy_init(phy_info->phydev, priv->plat_ex->hw_rj45_type); + phy_info->phydev->autoneg = phy_info->an; + if (phy_info->an != AUTONEG_ENABLE) { + phy_info->phydev->speed = phy_info->speed; + phy_info->phydev->duplex = phy_info->dup; + } + + phy_info->phy_ops->start_phy_timer(phy_info); + } else if (phy_info->media_type == DN200_MEDIA_TYPE_XPCS_1000BASEX || + phy_info->media_type == DN200_MEDIA_TYPE_XPCS_10GBASEKR) { + /*reinit phy state */ + dn200_sfp_phy_state_init(phy_info); + /* Start the I2C controller */ + phy_info->sfp_tx_disable = 0; + dn200_phy_set_sfp_tx_disable(phy_info); + phy_info->speed_reset_time = jiffies + DN200_SFP_RESET_TIME; + phy_info->phy_ops->start_phy_timer(phy_info); + usleep_range(10000, 20000); + dn200_phy_read(phy_info, MDIO_MMD_PCS, MDIO_STAT1); + phy_info->sfp_rx_los = 1; + phy_info->recfg_an = true; + } + if (phy_info->phydev && phy_info->an) + dn200_phy_an_config(phy_info); + set_bit(DN200_PHY_STARTED, &phy_info->phy_state); + phy_info->phy_status_time_intr = DN200_PHY_STATUS_DINTR; + return ret; +} + +static int dn200_vf_phy_start(struct dn200_phy_info *phy_info) +{ + phy_info->phy_ops->start_phy_timer(phy_info); + set_bit(DN200_PHY_STARTED, &phy_info->phy_state); + phy_info->phy_status_time_intr = DN200_PHY_STATUS_DINTR; + phy_info->phy_ops->link_status(phy_info); + return 0; +} + +static int __maybe_unused dn200_phy_best_advertised_speed(struct dn200_phy_info *phy_info) +{ + switch (phy_info->phy_interface) { + /*extern 1G copper */ + case PHY_INTERFACE_MODE_RGMII_ID: + return SPEED_1000; + /*XPCS 10G fibre */ + case PHY_INTERFACE_MODE_XGMII: + return SPEED_10000; + /*XPCS 1G fibre */ + case PHY_INTERFACE_MODE_GMII: + return SPEED_1000; + /*extern 1G fibre */ + case PHY_INTERFACE_MODE_1000BASEX: + return SPEED_1000; + default: + return SPEED_UNKNOWN; + } + return SPEED_UNKNOWN; +} + +static int dn200_phy_init(struct dn200_phy_info 
*phy_info) +{ + struct ethtool_link_ksettings *lks = &phy_info->lks; + struct dn200_priv *priv = netdev_priv(phy_info->dev); + /*disable an */ + if (!phy_info->phydev) { + dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN, + MDIO_CTRL1, AN_CTRL_AN_EN, AN_CTRL_AN_EN_S, 0); + usleep_range(10000, 15000); + } + /*Get Current media type */ + phy_info->phy_ops->media_type_get(phy_info); + dn200_phy_set_link_mode(phy_info, phy_info->media_type); + phy_info->phy_ops->init_phy_timer(phy_info); + if (!phy_info->phydev) { + if (!phy_info->self_adap_reset) + phy_info->speed = phy_info->setting_speed; + if (phy_info->speed == SPEED_1000) { + phy_info->mac_ops->mac_speed_set(priv, + phy_info->phy_interface, + phy_info->speed); + if (phy_info->last_link_speed != phy_info->speed) + dn200_xpcs_switch_to_1G(phy_info); + if (netif_msg_ifup(priv)) + netdev_info(phy_info->dev, + "PHY %d initialized as 1G speed mode\n", + phy_info->xpcs_idx); + } + if (phy_info->speed == SPEED_10000) { + /* we shoudl call dn200_xpcs_switch_to_10G rather than + * dn200_xpcs_init_as_10G because when set 1G speed last time, + * now we want 10G speed. + * if we call dn200_xpcs_init_as_10G,maybe we can't + * switch successfully. 
+ */ + phy_info->mac_ops->mac_speed_set(priv, + phy_info->phy_interface, + phy_info->speed); + if (phy_info->last_link_speed != phy_info->speed) + dn200_xpcs_switch_to_10G(phy_info); + if (netif_msg_ifup(priv)) + netdev_info(phy_info->dev, + "PHY %d initialized as 10GBASE-KR\n", + phy_info->xpcs_idx); + } + phy_info->last_link_speed = phy_info->speed; + phy_info->self_adap_reset = false; + clear_bit(DN200_PHY_IN_RESET, &phy_info->phy_state); + } + if (DN200_ADV(lks, Autoneg)) { + phy_info->an = AUTONEG_ENABLE; + if (phy_info->port_type != PORT_FIBRE) { + phy_info->speed = SPEED_UNKNOWN; + phy_info->dup = DUPLEX_UNKNOWN; + } + } else { + phy_info->an = AUTONEG_DISABLE; + // phy_info->max_speed = dn200_phy_best_advertised_speed(phy_info); + phy_info->dup = DUPLEX_FULL; + } + return 0; +} + +static int dn200_vf_phy_init(struct dn200_phy_info *phy_info) +{ + phy_info->phy_ops->init_phy_timer(phy_info); + return 0; +} + +static int dn200_phy_reset(struct dn200_phy_info *phy_info) +{ + u16 reg_val = 0; + + if (phy_info->phydev) { + genphy_soft_reset(phy_info->phydev); + return phy_init_hw(phy_info->phydev); + } + + reg_val = + dn200_xpcs_read(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + MDIO_CTRL1); + reg_val |= MDIO_CTRL1_RESET; + reg_val = + dn200_xpcs_write(phy_info, phy_info->phy_addr, MDIO_MMD_PCS, + MDIO_CTRL1, reg_val); + return dn200_phy_init(phy_info); +} + +const struct dn200_phy_ops dn200_phy_ops_info = { + .media_type_get = dn200_phy_media_type, + .identity = dn200_phy_identity, + .link_status = dn200_phy_status, + .read_i2c_byte = dn200_generic_i2c_byte_read, + .write_i2c_byte = dn200_generic_i2c_byte_write, + .read_i2c_eeprom = dn200_i2c_read_eeprom, + .read_i2c_sff8472 = dn200_i2c_read_sff8472, + .write_i2c_eeprom = dn200_i2c_write_eeprom, + .init = dn200_phy_init, + .an_config = dn200_phy_an_config, + .start = dn200_phy_start, + .stop = dn200_phy_stop, + .reset = dn200_phy_reset, + .set_speeds = dn200_phy_set_speed, + .led_control = dn200_phy_set_led, + 
	.blink_control = dn200_blink_control,
	.init_phy_timer = dn200_init_phy_status_timers,
	.start_phy_timer = dn200_start_phy_status_timers,
	.stop_phy_timer = dn200_stop_phy_status_timers,
	.phy_timer_del = dn200_phy_timer_stop,
	.get_link_ksettings = dn200_get_link_ksettings,
	.set_link_ksettings = dn200_set_link_ksettings,
	.init_eee = dn200_phy_init_eee,
	.set_eee = dn200_ethtool_set_eee,
	.get_eee = dn200_ethtool_get_eee,
	.get_phy_pauseparam = dn200_get_phy_pauseparam,
	.set_phy_pauseparam = dn200_set_phy_pauseparam,
	.nway_reset = dn200_nway_reset,
	.phy_loopback = dn200_phy_loopback,
};

/* VF operation table: reduced set — no I2C/SFP, LED, reset, speed-set,
 * pause-set, nway-reset or loopback entries (those ops stay NULL, so
 * callers must check the pointer before invoking them).
 */
const struct dn200_phy_ops dn200_vf_phy_ops_info = {
	.link_status = dn200_vf_phy_status,
	.init = dn200_vf_phy_init,
	.start = dn200_vf_phy_start,
	.stop = dn200_vf_phy_stop,
	.init_phy_timer = dn200_init_phy_status_timers,
	.start_phy_timer = dn200_start_phy_status_timers,
	.stop_phy_timer = dn200_stop_phy_status_timers,
	.get_link_ksettings = dn200_get_link_ksettings,
	.set_link_ksettings = dn200_set_link_ksettings,
	.init_eee = dn200_phy_init_eee,
	.set_eee = dn200_ethtool_set_eee,
	.get_eee = dn200_ethtool_get_eee,
	.get_phy_pauseparam = dn200_get_phy_pauseparam,
};

/* dn200_hw_sideband_init :
 * init phy sideband signal
 * Puts LEDs and SFP TX into a known (off/disabled) state and samples
 * module presence.  For an external phydev only the LED is forced off.
 */
void dn200_hw_sideband_init(struct dn200_phy_info *phy_info)
{
	struct dn200_priv *priv = netdev_priv(phy_info->dev);

	dn200_phy_set_led(phy_info, 0);
	if (phy_info->phydev) {
		extern_phy_force_led(phy_info->phydev, priv, 0, BLINK_DISABLE);
		return;
	}
	dn200_led_blink_ctrl(&priv->plat_ex->ctrl, BLINK_DISABLE);
	phy_info->sfp_tx_disable = 1;
	dn200_phy_set_sfp_tx_disable(phy_info); /*disable tx */
	dn200_phy_sfp_present(phy_info);
}

/* Returns true only for the PCI device IDs whose boards wire up the SFP
 * I2C bus (see the whitelist below).
 */
static bool dn200_check_i2c_supported(struct dn200_priv *priv)
{
	struct pci_dev *pdev = to_pci_dev(priv->device);

	switch (pdev->device) {
	case DN200_DEV_ID_SFP_10G_2P_PURE_PF:
	case DN200_DEV_ID_SFP_10G_2P_SRIOV_PF:
	case DN200_DEV_ID_SFP_10G_2P_NVME_PUREPF:
case DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF: + case DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_PF: + return true; + default: + /*todo: other deviceid also need to support i2c */ + break; + } + return false; +} + +/** + * dn200_phy_info_init - phy info init + * @dev: device pointer + * + * Description: init phy info struct + * + * Returns 0 on success, negative value on error + */ + +int dn200_phy_info_init(struct net_device *dev, + const struct dn200_mac_ops *mac_ops) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct dn200_phy_info *phy_info; + struct dn200_xpcs_info *xpcs; + struct phy_device *phydev; + int ret = 0; + u32 val; + + /* Some DT bindings do not set-up the PHY handle. Let's try to + * manually parse it + */ + int addr = priv->plat->phy_addr; + + phy_info = devm_kzalloc(priv->device, sizeof(*phy_info), GFP_KERNEL); + if (!phy_info) + return -ENOMEM; + priv->plat_ex->phy_info = phy_info; + phy_info->mac_ops = mac_ops; + phy_info->dev = dev; + phy_info->phy_interface = priv->plat->phy_interface; + phy_info->phy_addr = addr; + phy_info->xpcs_idx = priv->plat_ex->xpcs_index; + phy_info->pause = priv->flow_ctrl; + phy_info->cur_an = AUTONEG_ENABLE; + /* Create workqueues */ + phy_info->dev_workqueue = + create_singlethread_workqueue(netdev_name(dev)); + if (!phy_info->dev_workqueue) { + netdev_err(dev, "device workqueue creation failed\n"); + ret = -ENOMEM; + goto phy_info_free; + } + +// phy_info->dev_workqueue = priv->wq; + INIT_WORK(&phy_info->phy_status_work, dn200_phy_status_service); + INIT_WORK(&phy_info->kr_train_work, dn200_kr_train_work); + if (PRIV_IS_VF(priv)) { + phy_info->phy_ops = &dn200_vf_phy_ops_info; + return 0; + } + phy_info->phy_ops = &dn200_phy_ops_info; + + if (priv->plat_ex->has_xpcs) + goto xpcs_init; + phydev = mdiobus_get_phy(priv->mii, addr); + if (!phydev) { + netdev_err(dev, "phydev get failed\n"); + ret = -ENODEV; + goto phy_info_free; + } + + ret = get_rj45_type(&priv->plat_ex->ctrl, &val); + priv->plat_ex->hw_rj45_type = val; + 
phy_info->mii_bus = priv->mii; + phy_info->phydev = phydev; + extern_phy_init(phydev, priv->plat_ex->hw_rj45_type); + return ret; + +xpcs_init: + + xpcs = + devm_kzalloc(priv->device, sizeof(struct dn200_xpcs_info), + GFP_KERNEL); + if (!xpcs) { + ret = -ENOMEM; + netdev_err(dev, "Alloc phy_xpcs memory failed!\n"); + goto phy_info_free; + } + INIT_DELAYED_WORK(&phy_info->phy_multispeed_work, + dn200_phy_multispeed_service); + xpcs->xpcs_read = dn200_xpcs_read; + xpcs->xpcs_write = dn200_xpcs_write; + xpcs->xpcs_regs_base = priv->ioaddr; + + phy_info->xpcs = xpcs; + phy_info->sfp_has_gpio = true; + + phy_info->xpcs_sfp_valid = dn200_check_i2c_supported(priv); + + phy_info->gpio_data = priv->plat_ex->gpio_data; + phy_info->gpio_base = + priv->ioaddr + phy_info->gpio_data->gpio_addr_offset; + dn200_hw_sideband_init(phy_info); + dn200_conf_pcie_common_para(phy_info); + phy_info->max_speed = priv->plat_ex->max_speed; + dn200_phy_fec_enable(dev, false); + return 0; + +phy_info_free: + devm_kfree(priv->device, phy_info); + return ret; +} + +int dn200_phy_info_remove(struct net_device *dev) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct dn200_phy_info *phy_info = PRIV_PHY_INFO(priv); + u8 phylane_idx = 3 - phy_info->xpcs_idx; + + if (phy_info && phy_info->phy_ops && phy_info->phy_ops->stop) + phy_info->phy_ops->stop(phy_info); + if (phy_info->xpcs) { + if (!PRIV_IS_VF(priv)) { + /* recover the config for old driver */ + _phy_reg_write(phy_info, 0x1022 + 0x200 * phylane_idx, 0x57b); + } + devm_kfree(priv->device, phy_info->xpcs); + } + if (phy_info->phy_status_work.func) + cancel_work_sync(&phy_info->phy_status_work); + if (phy_info->phy_multispeed_work.work.func) + cancel_delayed_work_sync(&phy_info->phy_multispeed_work); + destroy_workqueue(phy_info->dev_workqueue); + devm_kfree(priv->device, phy_info); + return 0; +} + +int dn200_phy_fec_enable(struct net_device *dev, bool enable) +{ + struct dn200_priv *priv = netdev_priv(dev); + struct dn200_phy_info 
	*phy_info = PRIV_PHY_INFO(priv);

	/* External copper PHYs do not use this FEC path. */
	if (phy_info->phydev)
		return -EOPNOTSUPP;

	if (enable)
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
				   SR_AN_ADV3, GENMASK(15, 14), 14, 0x3);
	else
		dn200_xpcs_set_bit(phy_info, phy_info->phy_addr, MDIO_MMD_AN,
				   SR_AN_ADV3, GENMASK(15, 14), 14, 0);

	/* Bounce the PHY so the new advertisement is renegotiated, but only
	 * when a 10G link is actually up.
	 */
	if (netif_running(dev) &&
	    phy_info->link_status == DN200_LINK_UP &&
	    phy_info->speed == SPEED_10000) {
		dn200_phy_stop(phy_info);
		usleep_range(1000, 2000);
		dn200_phy_start(phy_info);
		return dn200_nway_reset(phy_info);
	}

	return 0;
}
diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_pool.c b/drivers/net/ethernet/dapustor/dn200/dn200_pool.c
new file mode 100644
index 000000000000..0c0d292f3fbe
--- /dev/null
+++ b/drivers/net/ethernet/dapustor/dn200/dn200_pool.c
@@ -0,0 +1,669 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2024, DapuStor Corporation.
 *
 * Author: Guo Feng
 *
 * Config dn200 ethernet buf pool
 */

#include "dn200.h"
#include "common.h"
#include "dn200_pool.h"
#include "dn200_iatu.h"
#include "dn200_reg.h"

#define DN200_PAGE_ORDER (MAX_ORDER - 1)
#define DN200_RING_SIZE_512 512

/* Returns true when @dma_addr can be handed to the device as-is, i.e.
 * without going through an iATU translation window.
 */
bool dma_can_direct_use(struct dn200_priv *priv, dma_addr_t dma_addr)
{
	int addr_bits_limit = priv->plat_ex->addr_bits_limit;
	u64 addr_forbid_bits = priv->plat_ex->addr_forbid_bits;

	/* vf must use iatu and can't direct use the dma */
	if (PRIV_IS_VF(priv))
		return false;

	/* if raid not exist:
	 * 1. for pf: we can use the dma when it is within the addr limit and not be forbidden
	 * 2. for vf: just can support dma32
	 */
	if (!priv->plat_ex->raid_supported) {
		/* dma address longer than addr bits limit (e.g.
40bits for pf or 32bits for vf) */ + if ((dma_addr >> addr_bits_limit) > 0) + return false; + + /* dma address include all forbidden bits */ + if (addr_forbid_bits != 0 && + ((dma_addr & addr_forbid_bits) == addr_forbid_bits)) { + return false; + } + return true; + } + + /* if raid exist, all dma address can't be directly used */ + return false; +} + +static unsigned int dn200_page_pool_size_get(struct dn200_priv *priv) +{ + unsigned int pool_size = 0; + unsigned int ring_size = priv->dma_rx_size; + int muiltple_num = 4; + unsigned int buf_ring_sz = 0; + + if ((PAGE_SIZE >> DN200_DEFAULT_PAGE_SIZE_SHIFT) > 8) + muiltple_num *= 4; + else if ((PAGE_SIZE >> DN200_DEFAULT_PAGE_SIZE_SHIFT) > 4) + muiltple_num *= 3; + else if ((PAGE_SIZE >> DN200_DEFAULT_PAGE_SIZE_SHIFT) > 2) + muiltple_num *= 2; + if (ring_size <= DN200_RING_SIZE_512) + ring_size = DN200_RING_SIZE_512; + + buf_ring_sz = priv->plat->rx_queues_to_use * ring_size * muiltple_num; + if (buf_ring_sz > DN200_MT_32K_MAX_BUF_SIZE) + buf_ring_sz = DN200_MT_32K_MAX_BUF_SIZE; + pool_size = buf_ring_sz * DN200_RX_BUF_SIZE; + + return pool_size; +} + +static inline struct page *dn200_alloc_page(struct dn200_priv *priv, + dma_addr_t *dma_addr, + dma_addr_t *base_addr) +{ + struct page *page = NULL; + dma_addr_t addr = 0; + gfp_t gfp_mask; + + enum dma_data_direction dma_dir = DMA_FROM_DEVICE; + + /*1. pure & sriov pf: use normal memory to alloc page first, and then check the dma addr + *2. 
vf: don't use normal memory to alloc, use dma32 directly + */ + gfp_mask = GFP_ATOMIC | __GFP_NOWARN | __GFP_COMP | + __GFP_MEMALLOC; + if (priv->plat->addr64 == 32) + gfp_mask |= GFP_DMA32; + + page = + alloc_pages_node(priv->numa_node, + gfp_mask, dn200_rx_pg_order_get(priv)); + if (!page) { + dev_err(priv->device, "no page memory: %s, %d.\n", + __func__, __LINE__); + return NULL; + } + + addr = + dma_map_page_attrs(priv->device, page, 0, + dn200_rx_pg_size_get(priv), dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", __func__, + __LINE__); + __free_pages(page, dn200_rx_pg_order_get(priv)); + return NULL; + } + if (dn200_rx_iatu_find(addr, priv, base_addr) < 0) { + dev_dbg(priv->device, + "%s, %d, addr iatu find failed, dma_addr:%#llx, page phys:%#llx\n", + __func__, __LINE__, addr, page_to_phys(page)); + + if (page) { + __free_pages(page, dn200_rx_pg_order_get(priv)); + dma_unmap_page_attrs(priv->device, addr, + dn200_rx_pg_size_get(priv), + dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + } + + page = __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN | GFP_DMA32, + dn200_rx_pg_order_get(priv)); + if (!page) { + dev_err(priv->device, + "no page memory: %s, %d, page order:%d\n", + __func__, __LINE__, + dn200_rx_pg_order_get(priv)); + + return NULL; + } + addr = + dma_map_page_attrs(priv->device, page, 0, + dn200_rx_pg_size_get(priv), dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", + __func__, __LINE__); + __free_pages(page, dn200_rx_pg_order_get(priv)); + return NULL; + } + /* dma address maybe exceed 32bit for DMA32, + * 1. if not exceed, just use DMA32 iATU + * 2. 
if exceed, use iATU mapping same as normal memory + */ + if (dn200_rx_iatu_find(addr, priv, base_addr) < 0) { + dev_err(priv->device, + "%s, %d, dma32 addr iatu find failed, dma_addr:%#llx, page phys:%#llx\n", + __func__, __LINE__, addr, page_to_phys(page)); + dma_unmap_page_attrs(priv->device, addr, + dn200_rx_pg_size_get(priv), + dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + __free_pages(page, dn200_rx_pg_order_get(priv)); + return NULL; + } + } + *dma_addr = addr; + return page; +} + +static inline void dn200_free_all_pages(struct dn200_priv *priv) +{ + int pg_idx = 0; + struct dn200_page_pool *page_pool = &priv->page_pool; + enum dma_data_direction dma_dir = DMA_FROM_DEVICE; + + for (pg_idx = 0; pg_idx < page_pool->alloced_pages; ++pg_idx) { + if (page_pool->mem_info && page_pool->mem_info[pg_idx].page) { + dma_unmap_page_attrs(priv->device, + page_pool->mem_info[pg_idx].dma_addr, + dn200_rx_pg_size_get(priv), + dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING)); + __page_frag_cache_drain(page_pool->mem_info[pg_idx].page, page_pool->mem_info[pg_idx].page_ref_bias); + } + } + page_pool->alloced_pages = 0; +} + +static int dn200_page_pool_setup(struct dn200_priv *priv) +{ + int pg_idx = 0; + struct page *page = NULL; + struct dn200_page_pool *page_pool = &priv->page_pool; + struct dn200_mem_info *mem_info; + int pg_num = 0; + + memset(page_pool, 0, sizeof(*page_pool)); + pg_num = DIV_ROUND_UP(dn200_page_pool_size_get(priv), PAGE_SIZE); + dev_dbg(priv->device, "pg num: %s, %d, pg_num:%d.\n", __func__, + __LINE__, pg_num); + + mem_info = + vzalloc_node(pg_num * sizeof(struct dn200_mem_info), + priv->numa_node); + if (!mem_info) + return -ENOMEM; + + page_pool->device = priv->device; + page_pool->mem_info = mem_info; + page_pool->page_order = dn200_rx_pg_order_get(priv); + page_pool->total_pages = pg_num; + page_pool->alloced_pages = 0; + + for (pg_idx = 0; pg_idx < pg_num; ++pg_idx) { + dma_addr_t dma_addr; + dma_addr_t base_addr; 

		page = dn200_alloc_page(priv, &dma_addr, &base_addr);
		if (!page) {
			dev_err(priv->device,
				"no page memory: %s, %d, pg_idx:%d.\n",
				__func__, __LINE__, pg_idx);
			goto err_no_mem;
		}
		page_pool->mem_info[pg_idx].page = page;
		page_pool->mem_info[pg_idx].dma_addr = dma_addr;
		page_pool->mem_info[pg_idx].base_addr = base_addr;
		/* Bias the page refcount up-front so individual buffers can
		 * be handed out/recycled without per-buffer get_page calls.
		 */
		page_ref_add(page, USHRT_MAX - 1);
		page_pool->mem_info[pg_idx].page_ref_bias = USHRT_MAX;
	}
	page_pool->alloced_pages = pg_num;
	dev_dbg(priv->device,
		"page_pool: %s, %d, start pg:0x%p, ord:%d, tot_pgs:%d, tot_size:%ld. alloced_pages %d\n",
		__func__, __LINE__, page_pool->mem_info[0].page,
		page_pool->page_order, page_pool->total_pages,
		page_pool->total_pages * PAGE_SIZE, page_pool->alloced_pages);
	return 0;

err_no_mem:
	/* Only pg_idx pages were allocated; record that before freeing. */
	page_pool->alloced_pages = pg_idx;
	dev_err(priv->device, "no page memory: %s, %d, request pages:%d.\n",
		__func__, __LINE__, pg_num);
	dn200_free_all_pages(priv);
	return -ENOMEM;
}

/* Push one buffer onto the shared pool ring.  The dev_err fires when the
 * page's refcount no longer matches its recorded bias, i.e. an unexpected
 * outstanding reference exists at init time.
 */
static bool dn200_rx_pool_buf_init(struct dn200_bufpool *rx_pool,
				   struct dn200_page_buf *page_buf)
{
	struct dn200_bufring *pool_ring = rx_pool->pool_ring;
	u64 buf_addr = (u64) page_buf;

	page_buf->busy_cnt = 0;
	dn200_bufring_do_init_elem(pool_ring, (void *)&buf_addr, 1);
	if (page_ref_count(page_buf->page) - *(page_buf->page_ref_bias) != 0) {
		dev_err(rx_pool->device,
			" %s dma:%#llx, offset:%d, page_to_phys:%#llx, len:%d, busy_cnt:%d, page:%p, page_ref_count:%d page_ref_bias %d\n",
			__func__, page_buf->kernel_addr, page_buf->page_offset,
			page_to_phys(page_buf->page), page_buf->buf_len,
			page_buf->busy_cnt, page_buf->page,
			page_ref_count(page_buf->page),
			*(page_buf->page_ref_bias));
	}

	return true;
}

/* Copy the buffer descriptor into the pool's backing array and return a
 * pointer to the stored slot (buf_index advances by one).
 */
static struct dn200_page_buf *dn200_rx_reserve_buf_init(struct dn200_bufpool *rx_pool,
							struct dn200_page_buf *page_buf)
{
	struct dn200_page_buf *buf;

	page_buf->busy_cnt = 0;

	rx_pool->page_buf[rx_pool->buf_index] = *page_buf;
	buf = &rx_pool->page_buf[rx_pool->buf_index];

	rx_pool->buf_index++;
	return buf;
}

/* Split one mapped page into DN200_RX_BUF_SIZE buffers and register each
 * with the pool; reserved pages are stored but not queued on the ring.
 */
static int dn200_rx_pool_page_add(struct dn200_priv *priv, struct page *page,
				  dma_addr_t *kernel_dma_addr,
				  dma_addr_t *desc_dma_addr,
				  int *page_ref_bias, bool is_reserve)
{
	struct dn200_bufpool *rx_pool = &priv->buf_pool;
	int buf_num_per_pg = PAGE_SIZE / DN200_RX_BUF_SIZE;
	int buf_idx = 0;
	struct dn200_page_buf pg_buf;
	struct dn200_page_buf *buf;
	int rx_buf_size = DN200_RX_BUF_SIZE;

	for (buf_idx = 0; buf_idx < buf_num_per_pg; ++buf_idx) {
		memset(&pg_buf, 0, sizeof(pg_buf));
		pg_buf.buf_addr = page_address(page) + buf_idx * rx_buf_size;
		pg_buf.buf_len = rx_buf_size;
		pg_buf.page = page;
		pg_buf.desc_addr = *desc_dma_addr;
		pg_buf.kernel_addr = *kernel_dma_addr;
		pg_buf.page_offset =
		    buf_idx * rx_buf_size + dn200_rx_offset(priv);
		pg_buf.busy_cnt = 0;
		pg_buf.low_res = is_reserve;
		pg_buf.page_ref_bias = page_ref_bias;
		buf = dn200_rx_reserve_buf_init(rx_pool, &pg_buf);
		if (!is_reserve)
			dn200_rx_pool_buf_init(rx_pool, buf);
	}
	return 0;
}

/* Build the shared buffer pool: size the rings from the page pool,
 * allocate the pool ring, the cached ring and the per-buffer array, and
 * attach the pool to every RX queue.
 */
static int dn200_rx_pool_init(struct dn200_priv *priv)
{
	struct dn200_page_pool *page_pool = &priv->page_pool;
	struct dn200_bufpool *buf_pool = &priv->buf_pool;
	struct dn200_bufring *buf_ring;
	struct dn200_bufring *cached_ring = NULL;
	struct dn200_page_buf *page_buf_arr;
	int queue;
	int buf_num_per_pg = PAGE_SIZE / DN200_RX_BUF_SIZE;
	int buf_num = page_pool->total_pages * buf_num_per_pg; //rx_buf_pgs;
	int ring_size;
	int reserved = priv->plat->rx_queues_to_use * priv->dma_rx_size;

	memset(buf_pool, 0, sizeof(*buf_pool));
	page_pool->reserve_page = reserved / buf_num_per_pg;
	dev_dbg(priv->device,
		"%s reserve page size %d reserved buf %d total buf_num %d\n",
		__func__, page_pool->reserve_page, reserved, buf_num);
	if (buf_num < reserved) {
		dev_err(priv->device,
			"%s no enough pagebuf! now has %d, %d required at least!\n",
			__func__, buf_num, reserved);
		return -ENOMEM;
	}
	ring_size = roundup_pow_of_two((buf_num - reserved + 1));
	buf_ring = vzalloc_node(sizeof(*buf_ring), priv->numa_node);
	if (!buf_ring) {
		dev_err(priv->device, "%s failed to alloc buf_ring\n",
			__func__);
		return -ENOMEM;
	}
	buf_ring->ring_objs =
	    vzalloc_node(sizeof(u64) * ring_size, priv->numa_node);
	if (!buf_ring->ring_objs) {
		dev_err(priv->device, "%s failed to alloc buf_ring ring_objs\n",
			__func__);
		vfree(buf_ring);
		return -ENOMEM;
	}

	cached_ring = vzalloc_node(sizeof(*cached_ring), priv->numa_node);
	if (!cached_ring)
		goto free_buf_ring;
	cached_ring->ring_objs =
	    vzalloc_node(sizeof(u64) * ring_size, priv->numa_node);
	if (!cached_ring->ring_objs) {
		dev_err(priv->device,
			"%s failed to alloc cached_ring ring_objs\n", __func__);
		vfree(cached_ring);
		goto free_buf_ring;
	}

	buf_ring->ring_size = ring_size;
	buf_ring->ring_mask = ring_size - 1;
	buf_ring->device = priv->device;
	buf_ring->buf_num_per_page = buf_num_per_pg;
	atomic_set(&buf_ring->cons.head, 0);
	atomic_set(&buf_ring->cons.tail, 0);
	atomic_set(&buf_ring->prod.head, 0);
	atomic_set(&buf_ring->prod.tail, 0);

	cached_ring->ring_size = ring_size;
	cached_ring->ring_mask = ring_size - 1;
	cached_ring->device = priv->device;
	cached_ring->buf_num_per_page = buf_num_per_pg;
	atomic_set(&cached_ring->cons.head, 0);
	atomic_set(&cached_ring->cons.tail, 0);
	atomic_set(&cached_ring->prod.head, 0);
	atomic_set(&cached_ring->prod.tail, 0);
	dev_dbg(priv->device, "%s bufring size %d mask %#x\n", __func__,
		ring_size, ring_size - 1);
	page_buf_arr = vzalloc_node(sizeof(*page_buf_arr) * buf_num,
				    priv->numa_node);
	if (!page_buf_arr)
		goto err_out;

	buf_pool->pool_ring = buf_ring;
	buf_pool->cached_ring = cached_ring;
	buf_pool->buf_num_per_page = buf_num_per_pg;
	buf_pool->pool_size = buf_num - reserved;
	buf_pool->reserved_start =
buf_pool->pool_size; + buf_pool->buf_index = 0; + buf_pool->device = priv->device; + buf_pool->page_buf = page_buf_arr; + buf_pool->cache_size = + min_t(unsigned int, DN200_BUFPOOL_CACHE_MAX_SIZE, priv->dma_rx_size); + + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + rx_q->rx_pool = buf_pool; + } + return 0; + +err_out: + vfree(cached_ring->ring_objs); + vfree(cached_ring); +free_buf_ring: + vfree(buf_ring->ring_objs); + vfree(buf_ring); + return -ENOMEM; +} + +static int dn200_local_cache_init(struct dn200_priv *priv) +{ + struct dn200_bufpool_cache *local_cache; + struct dn200_buf_refill_stack *buf_refill; + struct dn200_buf_cache_ring *buf_cached; + int queue = 0; + + local_cache = vzalloc_node(sizeof(*local_cache) * + priv->plat->rx_queues_to_use, + priv->numa_node); + if (!local_cache) + return -ENOMEM; + priv->buf_pool.local_cache = local_cache; + + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { + /*INIT refill cache, stack model */ + buf_refill = &local_cache->buf_refill; + buf_refill->cache_size = + roundup_pow_of_two(priv->buf_pool.cache_size); + buf_refill->current_len = 0; + /*INIT cache ring, sp_sc ring model */ + buf_cached = &local_cache->buf_cached; + buf_cached->cache_size = DN200_BUFPOOL_CACHE_MAX_SIZE * 4; + buf_cached->head = 0; + buf_cached->tail = 0; + buf_cached->cache_mask = buf_cached->cache_size - 1; + + buf_refill->device = priv->device; + buf_cached->device = priv->device; + local_cache++; + } + + return 0; +} + +static int dn200_rx_pool_alloc(struct dn200_priv *priv) +{ + int pg_idx = 0; + struct dn200_page_pool *page_pool = NULL; + dma_addr_t kernel_dma_addr; + dma_addr_t desc_dma_addr; + struct page *page = NULL; + + page_pool = &priv->page_pool; + for (pg_idx = 0; + pg_idx < page_pool->total_pages - page_pool->reserve_page; + pg_idx++) { + page = page_pool->mem_info[pg_idx].page; + desc_dma_addr = page_pool->mem_info[pg_idx].base_addr; + 
kernel_dma_addr = page_pool->mem_info[pg_idx].dma_addr; + if (!page) { + dev_err(priv->device, + "page is null: %s, %d, pg_idx:%d.\n", __func__, + __LINE__, pg_idx); + return -ENOMEM; + } + dn200_rx_pool_page_add(priv, page, &kernel_dma_addr, + &desc_dma_addr, + &page_pool->mem_info[pg_idx].page_ref_bias, + false); + } + + for (; pg_idx < page_pool->total_pages; pg_idx++) { + page = page_pool->mem_info[pg_idx].page; + desc_dma_addr = page_pool->mem_info[pg_idx].base_addr; + kernel_dma_addr = page_pool->mem_info[pg_idx].dma_addr; + if (!page) { + dev_err(priv->device, + "page is null: %s, %d, pg_idx:%d.\n", __func__, + __LINE__, pg_idx); + return -ENOMEM; + } + dn200_rx_pool_page_add(priv, page, &kernel_dma_addr, + &desc_dma_addr, + &page_pool->mem_info[pg_idx].page_ref_bias, + true); + } + dev_dbg(priv->device, "%s put %d page_buf to buf_pool buf_ring %d\n", + __func__, priv->buf_pool.buf_index, + atomic_read(&priv->buf_pool.pool_ring->prod.head)); + + return 0; +} + +int dn200_rx_pool_setup(struct dn200_priv *priv) +{ + if (dn200_page_pool_setup(priv)) + return -ENOMEM; + + if (dn200_rx_pool_init(priv)) + return -ENOMEM; + + if (dn200_rx_pool_alloc(priv)) + return -ENOMEM; + + if (dn200_local_cache_init(priv)) + return -ENOMEM; + return 0; +} + +void dn200_rx_pool_destory(struct dn200_priv *priv) +{ + struct dn200_page_pool *page_pool = &priv->page_pool; + struct dn200_bufpool *buf_pool = &priv->buf_pool; + struct dn200_bufring *buf_ring = buf_pool->pool_ring; + struct dn200_bufring *cached_ring = buf_pool->cached_ring; + int queue; + + if (priv->buf_pool.local_cache) { + vfree(priv->buf_pool.local_cache); + priv->buf_pool.local_cache = NULL; + } + if (buf_pool->page_buf) { + vfree(buf_pool->page_buf); + buf_pool->page_buf = NULL; + } + if (buf_ring) { + vfree(buf_ring->ring_objs); + vfree(buf_ring); + buf_pool->pool_ring = NULL; + } + if (cached_ring) { + vfree(cached_ring->ring_objs); + vfree(cached_ring); + buf_pool->cached_ring = NULL; + } + 
dn200_free_all_pages(priv); + + for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { + struct dn200_rx_queue *rx_q = &priv->rx_queue[queue]; + + rx_q->rx_pool = NULL; + } + + if (page_pool->mem_info) { + vfree(page_pool->mem_info); + page_pool->mem_info = NULL; + } + + memset(&priv->page_pool, 0, sizeof(priv->page_pool)); + memset(&priv->buf_pool, 0, sizeof(priv->buf_pool)); +} + +struct page *dn200_alloc_dma_page_dir(struct dn200_priv *priv, + dma_addr_t *dma_addr, enum dma_data_direction dma_dir) +{ + struct page *page = NULL; + dma_addr_t base_addr; + dma_addr_t addr = 0; + + + page = alloc_pages_node(priv->numa_node, + GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO | __GFP_COMP | __GFP_MEMALLOC, 0); + if (!page) { + dev_err(priv->device, "no page memory: %s, %d.\n", __func__, __LINE__); + return NULL; + } + + addr = dma_map_page_attrs(priv->device, page, 0, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", __func__, __LINE__); + __free_pages(page, 0); + return NULL; + } + + if (priv->plat_ex->raid_supported) + goto result; + + /*judge addr exceed 40 bit or 32-40bit should not equal 0xE0 ~ 0xFF*/ + if ((addr >> 40) || (!(addr & GENMASK(39, 37)))) + goto result; + + if (test_bit(DN200_IATU_INIT, &priv->state)) { + if (dn200_rx_iatu_find(addr, priv, &base_addr) < 0) { + dev_dbg(priv->device, + "%s, %d, addr iatu find failed, dma_addr:%#llx, page phys:%#llx\n", + __func__, __LINE__, addr, page_to_phys(page)); + + if (page) { + __free_pages(page, 0); + dma_unmap_page_attrs(priv->device, addr, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + } + + page = __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN | GFP_DMA32, 0); + if (!page) { + dev_err(priv->device, "no page memory: %s, %d, page order:%d\n", + __func__, __LINE__, 0); + + return NULL; + } + addr = dma_map_page_attrs(priv->device, page, 0, PAGE_SIZE, dma_dir, + 
(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", + __func__, __LINE__); + __free_pages(page, 0); + return NULL; + } + /* dma address maybe exceed 32bit for DMA32, + * 1. if not exceed, just use DMA32 iATU + * 2. if exceed, use iATU mapping same as normal memory + */ + if (dn200_rx_iatu_find(addr, priv, &base_addr) < 0) { + dev_err(priv->device, + "%s, %d, dma32 addr iatu find failed, dma_addr:%#llx, page phys:%#llx\n", + __func__, __LINE__, addr, page_to_phys(page)); + dma_unmap_page_attrs(priv->device, addr, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + __free_pages(page, 0); + return NULL; + + } + } + } else { + if (!(addr >> MAX_LIMIT_RANGE_SHIFT)) + goto result; + + if (page) { + __free_pages(page, 0); + dma_unmap_page_attrs(priv->device, addr, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + } + page = __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN | GFP_DMA32, 0); + if (!page) { + dev_err(priv->device, "no page memory: %s, %d, page order:%d\n", + __func__, __LINE__, 0); + + return NULL; + } + addr = dma_map_page_attrs(priv->device, page, 0, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + if (dma_mapping_error(priv->device, addr)) { + dev_err(priv->device, "dma map error: %s, %d.\n", + __func__, __LINE__); + __free_pages(page, 0); + return NULL; + } + } + +result: + *dma_addr = addr; + return page; +} + +void dn200_free_dma_page_dir(struct dn200_priv *priv, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dma_dir) +{ + + dma_unmap_page_attrs(priv->device, dma_addr, PAGE_SIZE, dma_dir, + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); + __free_pages(page, 0); +} + diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_pool.h b/drivers/net/ethernet/dapustor/dn200/dn200_pool.h new file mode 100644 index 000000000000..ce3d82ace342 --- /dev/null +++ 
b/drivers/net/ethernet/dapustor/dn200/dn200_pool.h
@@ -0,0 +1,822 @@
/* SPDX-License-Identifier: GPL-2.0-or-later
 *
 * Copyright (c) 2024, DapuStor Corporation.
 *
 * Author: Guo Feng
 *
 * Config dn200 ethernet buf pool
 */

#ifndef __DN200_BUFPOOL_H__
#define __DN200_BUFPOOL_H__
#include "common.h"

#define DN200_RX_BUF_SIZE 2048
#define DN200_DEFAULT_PAGE_SIZE 4096
#define DN200_MT_32K_MAX_BUF_SIZE (256 * 1024)
#define DN200_4K_MAX_BUF_SIZE (128 * 1024)
#define DN200_DEFAULT_PAGE_SIZE_SHIFT 12
#define DN200_BUFPOOL_CACHE_MAX_SIZE 512
#define DN200_DEFAULT_CACHE_SIZE 128
#define DN200_DEFAULT_CACHE_TH 128
#define DN200_RECY_MIN_CACHE_SIZE 32

#define dn200_bufpool_align __aligned(sizeof(long))

/** prod/cons sync types */
enum dn200_ring_sync_type {
	DN200_RING_SYNC_MT,	/**< multi-thread safe (default mode) */
	DN200_RING_SYNC_ST,	/**< single thread only */
	DN200_RING_SYNC_MT_RTS,	/**< multi-thread relaxed tail sync */
	DN200_RING_SYNC_MT_HTS,	/**< multi-thread head/tail sync */
};

/* Head/tail index pair shared by the producer and consumer sides. */
struct dn200_bufring_headtail {
	atomic_t head;	/**< prod/consumer head. */
	atomic_t tail;	/**< prod/consumer tail. */
} dn200_bufpool_align;

/* One DN200_RX_BUF_SIZE slice of a pool page, as handed to the RX path. */
struct dn200_page_buf {
	struct page *page;
	int *page_ref_bias;	/* points at the owning page's recorded bias */
	dma_addr_t desc_addr;	/* address programmed into the RX descriptor */
	dma_addr_t kernel_addr;	/* raw dma_map_page address of the page */
	__u32 page_offset;
	void *buf_addr;		/* CPU virtual address of this slice */
	u16 buf_len;
	u16 busy_cnt;
	bool low_res; /* get buf from pool with low resource state */
};

/* Lock-free MPMC ring of dn200_page_buf pointers (DPDK rte_ring style). */
struct dn200_bufring {
	u32 ring_size;	/**< Size of ring. */
	u32 ring_mask;	/**< Mask (size-1) of ring. */
	struct device *device;
	u32 buf_num_per_page;

	char pad0 dn200_bufpool_align;	/**< empty cache line */
	/** Ring producer status. */
	struct dn200_bufring_headtail prod;

	char pad1 dn200_bufpool_align;	/**< empty cache line */
	/** Ring consumer status. */
	struct dn200_bufring_headtail cons;

	char pad2 dn200_bufpool_align;	/**< empty cache line */
	u64 *ring_objs;	/*Element pointer */
};

/* Per-queue single-producer/single-consumer cache ring. */
struct dn200_buf_cache_ring {
	u32 cache_size;	/**< Size of the cache, pow of 2 */
	u32 cache_mask;	/*cache_size - 1 */
	u32 head;
	u32 tail;
	struct device *device;
	char pad2 dn200_bufpool_align;	/**< empty cache line */
	/**
	 * Cache objects
	 *
	 * Cache is allocated to this size to allow it to overflow in certain
	 * cases to avoid needless emptying of cache.
	 */
	u64 *objs[DN200_BUFPOOL_CACHE_MAX_SIZE * 4] dn200_bufpool_align;
	u64 *tmp_objs[DN200_BUFPOOL_CACHE_MAX_SIZE] dn200_bufpool_align;
} dn200_bufpool_align;

/* Per-queue LIFO refill stack. */
struct dn200_buf_refill_stack {
	u32 cache_size;	/**< Size of the cache, pow of 2 */
	u32 current_len;	/**< Current cache count */
	bool use_reserve;
	struct device *device;
	char pad2 dn200_bufpool_align;	/**< empty cache line */
	/**
	 * Cache objects
	 *
	 * Cache is allocated to this size to allow it to overflow in certain
	 * cases to avoid needless emptying of cache.
	 */
	u64 *objs[DN200_BUFPOOL_CACHE_MAX_SIZE * 2] dn200_bufpool_align;
} dn200_bufpool_align;

struct dn200_bufpool_cache {
	/*buf stored at buf_cached ring firstly, and
	 * than check that the page ref count is 1 and send it to the buf_refill
	 */
	struct dn200_buf_cache_ring buf_cached;
	struct dn200_buf_refill_stack buf_refill;
};

/* Top-level RX buffer pool shared by all RX queues of one device. */
struct dn200_bufpool {
	int buf_num_per_page;	/**< Flags supplied at creation. */
	u32 pool_size;	/**< Max size of the bufpool. */
	u32 cache_size;	/*Max size of cache_size per queue */
	u32 reserved_start;	/* index where the reserved buffers begin */
	u32 buf_index;	/* next free slot in page_buf[] during setup */
	struct device *device;
	char pad0 dn200_bufpool_align;	/**< empty cache line */
	spinlock_t lock;
	char pad1 dn200_bufpool_align;	/**< empty cache line */
	char pad2 dn200_bufpool_align;	/**< empty cache line */
	struct dn200_bufpool_cache *local_cache;	/**< Per-queue local cache */
	struct dn200_bufring *pool_ring;
	struct dn200_bufring *cached_ring;
	struct dn200_page_buf *page_buf;	/*Stores the page buf structure */
};

bool dma_can_direct_use(struct dn200_priv *priv, dma_addr_t dma_addr);
int dn200_rx_pool_setup(struct dn200_priv *priv);
void dn200_rx_pool_destory(struct dn200_priv *priv);

/* NOTE(review): is_single is accepted but ignored — every caller pays the
 * cmpxchg even in single-producer/consumer mode.  Confirm whether a
 * plain-store fast path was intended.
 */
static __always_inline unsigned int
dn200_update_ringindex(atomic_t *v, int old_val, int new_val, bool is_single)
{
	return atomic_cmpxchg(v, old_val, new_val);
}

/* Reserve n slots on the producer side; returns n on success, 0 when the
 * ring lacks space (one slot is always kept free to tell full from empty).
 */
static __always_inline unsigned int
dn200_bufring_move_prod_head(struct dn200_bufring *r, unsigned int is_sp,
			     unsigned int n, int *old_head, int *new_head)
{
	unsigned int max = n;
	unsigned int free_entry = n;
	int cons_tail = 0;

	do {
		/* Reset n to the initial burst count */
		n = max;

		*old_head = atomic_read(&r->prod.head);
		cons_tail = atomic_read(&r->cons.tail);
		if (*old_head > cons_tail)
			free_entry = r->ring_size + cons_tail - *old_head - 1;
		else
			free_entry = cons_tail - *old_head - 1;

		if (n > free_entry)
			return 0;

		/* add rmb barrier to avoid load/load reorder in weak
		 * memory model. It is noop on x86
		 */
		smp_rmb();

		*new_head = (*old_head + n) & r->ring_mask;
	} while (unlikely(dn200_update_ringindex(&r->prod.head, *old_head,
						 *new_head,
						 is_sp) != *old_head));
	return n;
}

/* Claim up to n entries on the consumer side; the dequeue count is forced
 * to a multiple of buf_num_per_page (whole pages only), else 0.
 */
static __always_inline unsigned int
dn200_bufring_move_cons_head(struct dn200_bufring *r,
			     unsigned int n, int *old_head, int *new_head,
			     bool is_sc)
{
	unsigned int max = n;
	int free_entries = 0;
	int prod_tail = 0;

	/* move cons.head atomically */
	do {
		/* Restore n as it may change every loop */
		n = max;

		*old_head = atomic_read(&r->cons.head);

		/* add rmb barrier to avoid load/load reorder in weak
		 * memory model. It is noop on x86
		 */
		smp_rmb();
		prod_tail = atomic_read(&r->prod.tail);
		if (likely(prod_tail >= *old_head))
			free_entries = prod_tail - *old_head;
		else
			free_entries = r->ring_size + prod_tail - *old_head;
		/* Set the actual entries for dequeue */
		if (n > free_entries)
			n = free_entries;

		if (unlikely(n == 0 || (n % r->buf_num_per_page) != 0))
			return 0;

		*new_head = (*old_head + n) & r->ring_mask;

	} while (unlikely(dn200_update_ringindex(&r->cons.head, *old_head,
						 *new_head,
						 is_sc) != *old_head));
	return n;
}

/* Copy n u64 objects into the ring starting at prod_head, unrolled by 4
 * and wrapping at the end of the array.
 */
static __always_inline void
dn200_bufring_enqueue_elems_64(struct dn200_bufring *r, u32 prod_head,
			       const void *obj_table, u32 n)
{
	unsigned int i;
	const u32 size = r->ring_size;
	u32 idx = prod_head & r->ring_mask;
	u64 *ring_objs = (u64 *)r->ring_objs;
	const u64 *obj = (const u64 *)obj_table;

	if (likely(idx + n <= size)) {
		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {
			ring_objs[idx] = obj[i];
			ring_objs[idx + 1] = obj[i + 1];
			ring_objs[idx + 2] = obj[i + 2];
			ring_objs[idx + 3] = obj[i + 3];
		}
		switch (n & 0x3) {
		case 3:
			ring_objs[idx++] = obj[i++];
			fallthrough;
		case 2:
			ring_objs[idx++] = obj[i++];
			fallthrough;
		case 1:
			ring_objs[idx++] = obj[i++];
		}
	} else {
		for (i = 0; idx < size; i++, idx++)
			ring_objs[idx] = obj[i];
		/* Start at the beginning */
		for (idx = 0; i < n; i++, idx++)
			ring_objs[idx] = obj[i];
	}

}

/* Mirror of the enqueue copy: ring -> obj_table. */
static __always_inline void
dn200_bufring_dequeue_elems_64(struct dn200_bufring *r, u32 prod_head,
			       void *obj_table, u32 n)
{
	unsigned int i;
	const u32 size = r->ring_size;
	u32 idx = prod_head & r->ring_mask;
	u64 *ring_objs = (u64 *)r->ring_objs;
	u64 *obj = (u64 *)obj_table;

	if (likely(idx + n <= size)) {
		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {
			obj[i] = (ring_objs[idx]);
			obj[i + 1] = (ring_objs[idx + 1]);
			obj[i + 2] = (ring_objs[idx + 2]);
			obj[i + 3] = (ring_objs[idx + 3]);
		}
		switch (n & 0x3) {
		case 3:
			obj[i++] = (ring_objs[idx++]);
			fallthrough;
		case 2:
			obj[i++] = (ring_objs[idx++]);
			fallthrough;
		case 1:
			obj[i++] = (ring_objs[idx++]);
		}
	} else {
		for (i = 0; idx < size; i++, idx++)
			obj[i] = (ring_objs[idx]);
		/* Start at the beginning */
		for (idx = 0; i < n; i++, idx++)
			obj[i] = (ring_objs[idx]);
	}
}

static __always_inline void dn200_bufring_enqueue_elems(struct dn200_bufring *r,
							u32 prod_head,
							const void *obj_table,
							u32 num)
{
	dn200_bufring_enqueue_elems_64(r, prod_head, obj_table, num);
}

static __always_inline void dn200_bufring_dequeue_elems(struct dn200_bufring *r,
							u32 cons_head,
							void *obj_table,
							u32 num)
{
	/* 8B and 16B copies implemented individually to retain
	 * the current performance.
	 */
	dn200_bufring_dequeue_elems_64(r, cons_head, obj_table, num);
}

/* Publish a completed enqueue/dequeue by advancing the tail; with multiple
 * producers/consumers we spin until earlier operations have published.
 */
static __always_inline void
dn200_bufpool_ring_update_tail(struct dn200_bufring_headtail *ht, int old_val,
			       int new_val, u32 single, u32 enqueue)
{
	if (enqueue)
		/* add wmb barrier to avoid head not update*/
		smp_wmb();
	else
		/* add rmb barrier to avoid load/load reorder in weak
		 * memory model. It is noop on x86
		 */
		smp_rmb();

	/* If there are other enqueues/dequeues in progress that preceded us,
	 * we need to wait for them to complete
	 */
	if (!single) {
		while (atomic_cmpxchg(&ht->tail, old_val, new_val) != old_val)
			ndelay(1);
	} else {
		atomic_set(&ht->tail, new_val);
	}
}

/* Dequeue n elements: claim, copy out, publish.  Returns the count
 * actually dequeued (0 on failure).
 */
static __always_inline unsigned int
dn200_bufring_do_dequeue_elem(struct dn200_bufring *r, void *obj_table,
			      unsigned int n, unsigned int is_sc)
{
	int cons_head, cons_next;

	if (unlikely(n == 0))
		return 0;

	n = dn200_bufring_move_cons_head(r, n, &cons_head, &cons_next, 0);
	if (n == 0)
		goto end;

	dn200_bufring_dequeue_elems(r, cons_head, obj_table, n);

	dn200_bufpool_ring_update_tail(&r->cons, cons_head, cons_next, is_sc,
				       0);
end:
	return n;
}

/* Enqueue n elements: claim, copy in, publish. */
static __always_inline unsigned int
dn200_bufring_do_enqueue_elem(struct dn200_bufring *r, const void *obj_table,
			      unsigned int n, unsigned int is_sp)
{
	int prod_head, prod_next;

	if (unlikely(n == 0))
		return 0;

	n = dn200_bufring_move_prod_head(r, is_sp, n, &prod_head, &prod_next);
	if (n == 0)
		return 0;

	dn200_bufring_enqueue_elems(r, prod_head, obj_table, n);

	dn200_bufpool_ring_update_tail(&r->prod, prod_head, prod_next, is_sp,
				       1);

	return n;
}
/* Setup-time enqueue (multi-producer path, always publishes). */
static __always_inline unsigned int
dn200_bufring_do_init_elem(struct dn200_bufring *r, const void *obj_table,
			   unsigned int n)
{
	int prod_head, prod_next;

	if (unlikely(n == 0))
		return 0;

	n = dn200_bufring_move_prod_head(r, 0, n, &prod_head, &prod_next);
	if (n == 0)
		goto end;

	dn200_bufring_enqueue_elems(r, prod_head, obj_table, n);

	dn200_bufpool_ring_update_tail(&r->prod, prod_head, prod_next, 0, 1);
end:
	return n;
}

/* Refill a queue's local cache from the shared pool ring (continues past
 * the end of this chunk).
 */
static inline unsigned int
dn200_bufring_dequeue_elem(struct dn200_bufpool *rx_pool, void *obj_table,
			   unsigned int n, unsigned int is_sc)
{
	int i = 0;
	int max = n;
	int recy_count = 0;
	u64 *tmp_objs;
	int stride = rx_pool->buf_num_per_page;
	u64 *list_objs;

	n =
dn200_bufring_do_dequeue_elem(rx_pool->pool_ring, obj_table, max, + is_sc); + if (likely(n)) + return n; + + tmp_objs = + kzalloc(sizeof(u64) * DN200_BUFPOOL_CACHE_MAX_SIZE, GFP_ATOMIC); + if (!tmp_objs) + return 0; + + list_objs = kcalloc(stride, sizeof(u64), GFP_ATOMIC); + if (!list_objs) { + kfree(tmp_objs); + return 0; + } +recy: + recy_count = + dn200_bufring_do_dequeue_elem(rx_pool->cached_ring, + (void *)tmp_objs, + DN200_BUFPOOL_CACHE_MAX_SIZE, is_sc); + if (unlikely(!recy_count)) + goto free; + + for (i = 0; i < recy_count; i += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)tmp_objs[i]; + + if (((page_ref_count(buf->page) - *(buf->page_ref_bias))) != 0) + break; + } + if (likely(i)) + dn200_bufring_do_enqueue_elem(rx_pool->pool_ring, + (void *)tmp_objs, i, is_sc); + + if (i < recy_count) + dn200_bufring_do_enqueue_elem(rx_pool->cached_ring, + (void *)(tmp_objs + i), + recy_count - i, is_sc); + else + goto recy; + +free: + n = dn200_bufring_do_dequeue_elem(rx_pool->pool_ring, obj_table, max, + is_sc); + kfree(tmp_objs); + kfree(list_objs); + return n; +} + +#define DN200_RX_CACHE_MIN_SIZE 512 + +static __always_inline bool +dn200_rx_pool_buf_put(struct dn200_bufpool *rx_pool, + struct dn200_page_buf *page_buf) +{ + struct dn200_bufring *pool_ring = rx_pool->pool_ring; + + page_buf->busy_cnt = 0; + dn200_bufring_do_enqueue_elem(pool_ring, (void *)page_buf, 1, + (int)DN200_RING_SYNC_MT); + return true; +} + +static __always_inline unsigned int +dn200_page_buf_free(struct dn200_buf_cache_ring *buf_cached) +{ + if (buf_cached->tail > buf_cached->head) { + return buf_cached->tail - buf_cached->head - 1; + } else { + return buf_cached->cache_size + buf_cached->tail - + buf_cached->head - 1; + } +} + +static __always_inline u32 +dn200_page_buf_avail(struct dn200_buf_cache_ring *buf_cached) +{ + if (buf_cached->tail <= buf_cached->head) { + return buf_cached->head - buf_cached->tail; + } else { + return buf_cached->cache_size - 
(buf_cached->tail - + buf_cached->head); + } +} + +static inline int dn200_cache_enqueue2pool(struct dn200_bufpool *rx_pool, + u8 queue_id, u32 max) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + int stride = rx_pool->buf_num_per_page; + struct dn200_buf_cache_ring *buf_cached = &local_cache->buf_cached; + u64 *objs = (u64 *)buf_cached->objs; + int index = 0, i = 0; + u32 avail = dn200_page_buf_avail(buf_cached); + + max = min(avail, max); + max = ALIGN(max, stride); + if (max < stride) + return 0; + + for (index = buf_cached->tail, i = 0; + i < max && index < buf_cached->cache_size; + index += stride, i += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)objs[index]; + + if ((page_ref_count(buf->page) - *(buf->page_ref_bias)) != 0) + goto ref_unavail; + } + +ref_unavail: + if (i) { + objs = (u64 *)&buf_cached->objs[buf_cached->tail]; + dn200_bufring_do_enqueue_elem(rx_pool->pool_ring, (void *)objs, + i, 0); + buf_cached->tail = index & buf_cached->cache_mask; + } + dev_dbg(rx_pool->device, + "%s recycle count %d to refill bufstack now recycle tail %d buf_cached->head %d\n", + __func__, i, index & buf_cached->cache_mask, buf_cached->head); + return i; +} + +static int dn200_cache_buf_recycle_em(struct dn200_bufpool *rx_pool, + u8 queue_id, u32 max) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + int stride = rx_pool->buf_num_per_page; + struct dn200_buf_cache_ring *buf_cached = &local_cache->buf_cached; + struct dn200_buf_refill_stack *buf_refill = &local_cache->buf_refill; + u64 *objs = (u64 *)buf_cached->objs; + u64 *tmp_objs = (u64 *)buf_cached->tmp_objs; + u64 *refill_objs = (u64 *)&buf_refill->objs[0]; + int index = 0, i = 0, j; + int unavail_count = 0, refill_count = 0; + u32 avail = dn200_page_buf_avail(buf_cached); + + avail = (avail >= stride) ? 
(avail - stride) : 0; + max = min_t(u32, avail, max); + max = ALIGN(max, stride); + + max = min_t(u32, max, (u32)DN200_BUFPOOL_CACHE_MAX_SIZE); + if (max < stride) + return 0; + + for (index = buf_cached->tail, i = 0; + i < max && index < buf_cached->cache_size; index += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)objs[index]; + + if (unlikely((page_ref_count(buf->page) - *(buf->page_ref_bias)) != 0)) { + for (j = 0; j < stride; j++) + tmp_objs[unavail_count++] = objs[index + j]; + } else { + for (j = 0; j < stride; j++) + refill_objs[refill_count++] = objs[index + j]; + } + i += stride; + } + dev_dbg(rx_pool->device, + "%s recycle count %d to refill bufstack %d to cache ring .now recycle tail %d, head %d cache mask %d\n", + __func__, refill_count, unavail_count, + index & buf_cached->cache_mask, buf_cached->head, + buf_cached->cache_mask); + + buf_refill->current_len = refill_count; + buf_cached->tail = index & buf_cached->cache_mask; + if (unavail_count) + dn200_bufring_do_enqueue_elem(rx_pool->cached_ring, + (void *)tmp_objs, unavail_count, + 0); + + if (unlikely(dn200_page_buf_avail(buf_cached) > + DN200_BUFPOOL_CACHE_MAX_SIZE)) + dn200_cache_enqueue2pool(rx_pool, queue_id, + dn200_page_buf_avail(buf_cached) - + DN200_BUFPOOL_CACHE_MAX_SIZE); + + + return buf_refill->current_len; +} + +static inline int dn200_cache_buf_recycle(struct dn200_bufpool *rx_pool, + u8 queue_id, u32 max) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + int stride = rx_pool->buf_num_per_page; + struct dn200_buf_refill_stack *buf_refill = &local_cache->buf_refill; + struct dn200_buf_cache_ring *buf_cached = &local_cache->buf_cached; + u64 *refill_objs = (u64 *)&buf_refill->objs[0]; + u64 *objs = (u64 *)buf_cached->objs; + int index = 0, i = 0, j; + u32 avail = dn200_page_buf_avail(buf_cached); + + if (unlikely(avail <= DN200_RECY_MIN_CACHE_SIZE)) + return 0; + + if (unlikely(avail > DN200_BUFPOOL_CACHE_MAX_SIZE)) + return 
dn200_cache_buf_recycle_em(rx_pool, queue_id, max); + + avail = (avail >= stride) ? (avail - stride) : 0; + max = min(avail, max); + max = ALIGN(max, stride); + if (max < stride) + return 0; + + if (likely((max + buf_cached->tail) < buf_cached->cache_size)) { + for (index = buf_cached->tail, i = 0; i < max; + index += stride, i += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)objs[index]; + + if ((page_ref_count(buf->page) - *(buf->page_ref_bias)) != 0) + goto ref_unavail; + + for (j = 0; j < stride; j++) + refill_objs[i + j] = objs[index + j]; + + } + } else { + for (index = buf_cached->tail, i = 0; + index < buf_cached->cache_size; + index += stride, i += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)objs[index]; + + if ((page_ref_count(buf->page) - *(buf->page_ref_bias)) != 0) + goto ref_unavail; + + for (j = 0; j < stride; j++) + refill_objs[i + j] = objs[index + j]; + } + for (index = 0; i < max; index += stride, i += stride) { + struct dn200_page_buf *buf = + (struct dn200_page_buf *)objs[index]; + + if ((page_ref_count(buf->page) - *(buf->page_ref_bias)) != 0) + goto ref_unavail; + + for (j = 0; j < stride; j++) + refill_objs[i + j] = objs[index + j]; + } + } + +ref_unavail: + dev_dbg(rx_pool->device, + "%s recycle count %d to refill bufstack now recycle tail %d head %d cache_mask %d cache_size %d\n", + __func__, i, index & buf_cached->cache_mask, buf_cached->head, + buf_cached->cache_mask, buf_cached->cache_size); + buf_cached->tail = index & buf_cached->cache_mask; + buf_refill->current_len = i; + return i; +} + +#define DN200_USE_RESVERED_BUF 1 +static inline int dn200_rx_pool_buf_alloc_n(struct dn200_bufpool *rx_pool, + u8 queue_id, u8 count, void **buf) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + struct dn200_buf_refill_stack *buf_refill = &local_cache->buf_refill; + u64 *cache_objs; + u64 *buf_objs = (u64 *)buf; + int ret, recy; + u32 index, remaining = 0; + u32 max = 
count, n; + + if (unlikely(max != 1)) { + dev_warn(rx_pool->device, "%s only one buf can be alloced for at a time! queue %d\n", + __func__, queue_id); + return 0; + } + n = min(buf_refill->current_len, max); + remaining = max - n; + + if (n == 0) + goto get_remaining; + + cache_objs = (u64 *)&buf_refill->objs[buf_refill->current_len - 1]; + for (index = 0; index < n; index++) + *buf_objs = *cache_objs; + + buf_refill->current_len -= n; + if (remaining == 0) + return 0; + + +get_remaining: + recy = dn200_cache_buf_recycle(rx_pool, queue_id, + buf_refill->cache_size); + if (recy) + goto put_remaining; + + /* Fill the cache from the backend; fetch size + remaining objects. */ + ret = dn200_bufring_dequeue_elem(rx_pool, buf_refill->objs, + buf_refill->cache_size, 0); + if (unlikely(ret <= 0)) { + dev_dbg(rx_pool->device, "%s get buf from buf_ring failed\n", + __func__); + return DN200_USE_RESVERED_BUF; + } + buf_refill->current_len = ret; + +put_remaining: + /* Satisfy the remaining part of the request from the filled cache. 
*/ + cache_objs = (u64 *)&buf_refill->objs[buf_refill->current_len - 1]; + n = min(buf_refill->current_len, remaining); + remaining -= n; + for (index = 0; index < n; index++) + *buf_objs = *cache_objs; + + buf_refill->current_len -= n; + if (unlikely(remaining)) + goto get_remaining; + + return 0; +} + +static inline struct dn200_page_buf * +dn200_rx_pool_buf_alloc(struct dn200_bufpool *rx_pool, u8 queue_id, int offset, + int dma_rx_size) +{ + struct dn200_page_buf *buf = NULL; + int ret = 0; + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + struct dn200_buf_refill_stack *buf_refill = &local_cache->buf_refill; + + if (unlikely(buf_refill->use_reserve)) { + if (offset & (rx_pool->buf_num_per_page - 1)) { + return &rx_pool->page_buf[rx_pool->reserved_start + + queue_id * dma_rx_size + + offset]; + } else { + buf_refill->use_reserve = false; + } + } + + ret = dn200_rx_pool_buf_alloc_n(rx_pool, queue_id, 1, (void **)&buf); + if (unlikely(ret == DN200_USE_RESVERED_BUF || !buf)) { + dev_dbg(rx_pool->device, "%s get buf failed! queue %d\n", + __func__, queue_id); + buf = &rx_pool->page_buf[rx_pool->reserved_start + + queue_id * dma_rx_size + offset]; + buf_refill->use_reserve = true; + } + return buf; +} + +#define DN200_BUFCACHED_NEXT_HEAD(h, c) ((h + 1) & c->cache_mask) + +#define DN200_BUFCACHED_AVAILD(cache) ((((cache)->tail >= (cache)->head) ? 
0 : (cache)->size) + \ + (ring)->tail - (ring)->head) + +static inline int dn200_cache_clean(struct dn200_bufpool *rx_pool, u8 queue_id, + u32 max) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + int stride = rx_pool->buf_num_per_page; + struct dn200_buf_cache_ring *buf_cached = &local_cache->buf_cached; + u64 *objs; + u32 avail = dn200_page_buf_avail(buf_cached); + + max = ALIGN(max, stride); + max = min(avail, max); + if (max < stride) + return 0; + + max = min(max, (buf_cached->cache_size - buf_cached->tail)); + objs = (u64 *)&buf_cached->objs[buf_cached->tail]; + dn200_bufring_do_enqueue_elem(rx_pool->cached_ring, (void *)objs, max, + 0); + buf_cached->tail = (max + buf_cached->tail) & buf_cached->cache_mask; + + return max; +} + +static inline void dn200_rx_pool_buf_free(struct dn200_bufpool *rx_pool, + u8 queue_id, + struct dn200_page_buf *buf) +{ + struct dn200_bufpool_cache *local_cache = + &rx_pool->local_cache[queue_id]; + struct dn200_buf_cache_ring *buf_cached = &local_cache->buf_cached; + u64 *cache_objs; + u32 head = buf_cached->head; + u32 free = 0; + + /*reserved page_buf, do not need to be recycled */ + if (unlikely(buf->low_res)) + return; + + (*(buf->page_ref_bias))--; + if (unlikely(*(buf->page_ref_bias) == (rx_pool->buf_num_per_page - 1))) { + page_ref_add(buf->page, USHRT_MAX - (rx_pool->buf_num_per_page - 1)); + *(buf->page_ref_bias) = USHRT_MAX; + } + free = dn200_page_buf_free(buf_cached); + if (unlikely(!free)) { + if (!dn200_cache_clean(rx_pool, queue_id, + DN200_BUFPOOL_CACHE_MAX_SIZE)) { + dev_err(rx_pool->device, + "%s free to bufpool cached failed!, cache buf full\n", + __func__); + return; + } + } + + cache_objs = (u64 *)&buf_cached->objs[head]; + head = DN200_BUFCACHED_NEXT_HEAD(head, buf_cached); + *cache_objs = (u64)buf; + buf_cached->head = head; +} + +struct page *dn200_alloc_dma_page_dir(struct dn200_priv *priv, + dma_addr_t *dma_addr, enum dma_data_direction dma_dir); + +void 
dn200_free_dma_page_dir(struct dn200_priv *priv, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dma_dir); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_prod.h b/drivers/net/ethernet/dapustor/dn200/dn200_prod.h new file mode 100644 index 000000000000..b44c1572586a --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_prod.h @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + * + */ + +#define __DN200_PROD_H__ + +#define XGE_NUM 4 +#define DN200_PORT_TYPE 2 +#define DN200_NUM_PCB_VER 8 +#define DN200_PORT_NUM 2 + +/** + * bit0-tx_disable, bit1-tx_fault, bit4-sfp_loss, + * bit5-sfp_mod_detect, bit6-sfp_rs0, bit7-sfp_rs1, + * bit8-led0 from PWM, bit9-led1 for 1G, bit10-led2 for 10G + */ +struct dn200_gpio_data + gpio_data_prod_type[DN200_PORT_TYPE][DN200_NUM_PCB_VER][DN200_PORT_NUM] = { + /*4p is the same */ + [0] = { + [0] = { + [0] = { + .gpio_addr_offset = DN200_SFPCTRL_MODE0_BAROFF, + .sfp_detect_pin = (0 << 16) | 5, /*0 is valid */ + .sfp_tx_disable_pin = (0 << 16) | 0, + .sfp_tx_fault_pin = (0 << 16) | 1, + .sfp_rx_los_pin = (0 << 16) | 4, + .sfp_rs0_pin = (0 << 16) | 6, + .sfp_rs1_pin = (0 << 16) | 7, + .sfp_led1_pin = (0 << 16) | 10, // low active + .sfp_led2_pin = (0 << 16) | 9, + .reg_off_set_write = 0xc, + .reg_off_set_read = 0x8, + }, + }, + }, + /*2p */ + [1] = { + /*2p old pcb */ + [0] = { + /*xgmac0 */ + [0] = { + .gpio_addr_offset = DN200_SFPCTRL_MODE1_BAROFF, + .sfp_detect_pin = (0 << 16) | 10, + .sfp_tx_disable_pin = (0 << 16) | 7, + .sfp_tx_fault_pin = (0 << 16) | 8, + .sfp_rx_los_pin = (0 << 16) | 9, + .sfp_rs0_pin = (0 << 16) | 11, + .sfp_rs1_pin = (0 << 16) | 11, + .sfp_led1_pin = (0 << 16) | 21, // low active + .sfp_led2_pin = (0 << 16) | 14, + .reg_off_set_write = 0x10, + .reg_off_set_read = 0x0, + }, + /*xgmac1 */ + [1] = { + .gpio_addr_offset = DN200_SFPCTRL_MODE1_BAROFF, + .sfp_detect_pin = (0 << 16) | 25, + .sfp_tx_disable_pin = (0 
<< 16) | 22, + .sfp_tx_fault_pin = (0 << 16) | 23, + .sfp_rx_los_pin = (0 << 16) | 24, + .sfp_rs0_pin = (0 << 16) | 26, + .sfp_rs1_pin = (0 << 16) | 26, + .sfp_led1_pin = (0 << 16) | 49, // low active + .sfp_led2_pin = (0 << 16) | 37, + .reg_off_set_write = 0x10, + .reg_off_set_read = 0x0, + }, + }, + /*2p new pcb */ + [1] = { + /*xgmac0 */ + [0] = { + .gpio_addr_offset = DN200_SFPCTRL_MODE1_BAROFF, + .sfp_detect_pin = (0 << 16) | 14, + .sfp_tx_disable_pin = (1 << 16) | 7, + .sfp_tx_fault_pin = (1 << 16) | 8, + .sfp_rx_los_pin = (1 << 16) | 9, + .sfp_rs0_pin = (1 << 16) | 11, + .sfp_rs1_pin = (1 << 16) | 11, + .sfp_led1_pin = (1 << 16) | 21, // low active + .sfp_led2_pin = (1 << 16) | 10, + .reg_off_set_write = 0x10, + .reg_off_set_read = 0x0, + }, + /*xgmac1 */ + [1] = { + .gpio_addr_offset = DN200_SFPCTRL_MODE1_BAROFF, + .sfp_detect_pin = (0 << 16) | 24, + .sfp_tx_disable_pin = (1 << 16) | 22, + .sfp_tx_fault_pin = (1 << 16) | 23, + .sfp_rx_los_pin = (1 << 16) | 25, + .sfp_rs0_pin = (1 << 16) | 26, + .sfp_rs1_pin = (1 << 16) | 26, + .sfp_led1_pin = (1 << 16) | 32, // low active + .sfp_led2_pin = (1 << 16) | 37, + .reg_off_set_write = 0x10, + .reg_off_set_read = 0x0, + }, + }, + }, +}; + +struct xge_link_config link_config[XGE_LINK_MODE_MAX - 1] = { + /*EXTERNAL_PHY_1000BASEX */ + { + .has_xpcs = 0, + .clk_ptp_rate = 250000000, // 250MHz for XGMAC, 50MHz for GMAC + .clk_csr = 0x8, // DN200_CSR_I_4 + .phy_interface = PHY_INTERFACE_MODE_1000BASEX, + .max_speed = 1000, + .clk_ref_rate = 500000000, + }, + /*EXTERNAL_PHY_SGMII */ + { + .has_xpcs = 0, + .clk_ptp_rate = 250000000, + .clk_csr = 0x8, + .phy_interface = PHY_INTERFACE_MODE_SGMII, + .max_speed = 1000, + .clk_ref_rate = 500000000, + }, + /*EXTERNAL_PHY_RGMII */ + { + .has_xpcs = 0, + .clk_ptp_rate = 250000000, + .clk_csr = 0x5, /*Clock Range 400-500M, refer to MDIO_Single_Command_Control_Data */ + .phy_interface = PHY_INTERFACE_MODE_RGMII_ID, + .max_speed = 1000, + .clk_ref_rate = 500000000, + }, + 
/*XPCS_PHY_1000BASEX */ + { + .has_xpcs = 1, + .clk_ptp_rate = 250000000, + .clk_csr = 0x8, + .phy_interface = PHY_INTERFACE_MODE_GMII, + .max_speed = 1000, + .clk_ref_rate = 500000000, + }, + /*XPCS_PHY_10GBASER */ + { + .has_xpcs = 1, + .clk_ptp_rate = 250000000, + .clk_csr = 0x5, + .phy_interface = PHY_INTERFACE_MODE_XGMII, + .max_speed = 10000, + .clk_ref_rate = 500000000, + }, +}; + +const struct xge_private_data xge_private_data_table[XGE_NUM] = { + [0] = { + /* phy address is just used by external MDIO phy, + * xpcs do not use it + */ + .phy_addr = 1, + .bus_id = 1, + .tx_queues_to_use = XGE01_QUEUES_TO_USE, + .rx_queues_to_use = XGE01_QUEUES_TO_USE, + .rx_queues_reserved = 8, + .tx_queues_reserved = 8, + .max_vfs = 8, + .rx_queues_total = 16, + .tx_queues_total = 16, + .rx_used_mtl_queues = 4, + }, + [1] = { + .phy_addr = 2, + .bus_id = 2, + .tx_queues_to_use = XGE01_QUEUES_TO_USE, + .rx_queues_to_use = XGE01_QUEUES_TO_USE, + .rx_queues_reserved = 8, + .tx_queues_reserved = 8, + .max_vfs = 8, + .rx_queues_total = 16, + .tx_queues_total = 16, + .rx_used_mtl_queues = 4, + }, + [2] = { + .phy_addr = 3, + .bus_id = 3, + .tx_queues_to_use = XGE23_QUEUES_TO_USE, + .rx_queues_to_use = XGE23_QUEUES_TO_USE, + .rx_queues_reserved = 1, + .tx_queues_reserved = 1, + .max_vfs = 1, + .rx_queues_total = 2, + .tx_queues_total = 2, + .rx_used_mtl_queues = 2, + }, + [3] = { + .phy_addr = 4, + .bus_id = 4, + .tx_queues_to_use = XGE23_QUEUES_TO_USE, + .rx_queues_to_use = XGE23_QUEUES_TO_USE, + .rx_queues_reserved = 1, + .tx_queues_reserved = 1, + .max_vfs = 1, + .rx_queues_total = 2, + .tx_queues_total = 2, + .rx_used_mtl_queues = 2, + }, +}; + +struct pci_device_map_t { + int pci_funcid[4]; + int xpcs_index[4]; + int pf_deviceid; + int vf_deviceid; + int max_pf; + int min_vf_funcid[4]; + int max_vf_funcid[4]; + int sriov_supported; + int nvme_supported; + u8 pf_max_iatu[4]; /* max iatu of each pf */ + u8 vf_total_iatu[4]; /* vf total iatu of each pf */ + struct xge_link_config 
*link_config; + bool raid_supported; + bool upgrade_with_flowing; +}; + +static int dn200_pcb_id_map2_gpio_type[8] = { 0, 1, 1 }; + +#define DN200_2P_MIN_PCB_ID 0 +#define DN200_2P_MAX_PCB_ID 2 +struct pci_device_map_t device_map[] = { + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_4P_SRIOV_PF, + .vf_deviceid = DN200_DEV_ID_SFP_10G_4P_SRIOV_VF, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3, + }, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + 0x4, 0x13, 0x22, 0x23, + }, + .max_vf_funcid = { + 0x12, 0x21, 0x22, 0x23, + }, + .sriov_supported = 1, + .nvme_supported = true, + .pf_max_iatu = { + 4, 4, 4, 4}, + .vf_total_iatu = { + 8, 8, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_4P_PURE_PF, + .vf_deviceid = 0, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3}, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + -1, -1, -1, -1, + }, + .max_vf_funcid = { + -1, -1, -1, -1, + }, + .sriov_supported = 0, + .nvme_supported = 0, + .pf_max_iatu = { + 8, 8, 8, 8}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_2P_SRIOV_PF, + .vf_deviceid = DN200_DEV_ID_SFP_10G_2P_SRIOV_VF, + .max_pf = 2, + .pci_funcid = { + 0, 1, + }, + .xpcs_index = { + 0, 1, + }, + .min_vf_funcid = { + 0x2, 0xa, + }, + .max_vf_funcid = { + 0x9, 0x11, + }, + .sriov_supported = 1, + .nvme_supported = 1, + .pf_max_iatu = { + 8, 8, 0, 0}, + .vf_total_iatu = { + 8, 8, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_2P_PURE_PF, + .vf_deviceid = 0, + .max_pf = 2, + .pci_funcid = { + 0, 1, 2, 3, + }, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + 0x2, 0x11, + }, + .max_vf_funcid = { + 0x10, 0x1f, + }, + 
.sriov_supported = 0, + .nvme_supported = 0, + .pf_max_iatu = { + 0, 0, 0, 0}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_1G_4P_SRIOV_PF, + .vf_deviceid = DN200_DEV_ID_SFP_1G_4P_SRIOV_VF, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3}, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + 0x4, 0x13, 0x22, 0x23, + }, + .max_vf_funcid = { + 0x12, 0x21, 0x22, 0x23, + }, + .sriov_supported = 1, + .nvme_supported = 1, + .pf_max_iatu = { + 4, 4, 4, 4}, + .vf_total_iatu = { + 8, 8, 0, 0}, + .link_config = &link_config[EXTERNAL_PHY_1000BASEX], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_1G_4P_PURE_PF, + .vf_deviceid = 0, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3}, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + 0x4, 0x13, 0x22, 0x23, + }, + .max_vf_funcid = { + 0x12, 0x21, 0x22, 0x23, + }, + .sriov_supported = 0, + .nvme_supported = 0, + .pf_max_iatu = { + 8, 8, 8, 8}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[EXTERNAL_PHY_1000BASEX], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_2P_NVME_PUREPF, + .vf_deviceid = 0, + .max_pf = 2, + .pci_funcid = { + 0, 1}, + .xpcs_index = { + 0, 1}, + .min_vf_funcid = { + 0, 0, 0, 0, + }, + .max_vf_funcid = { + 0, 0, 0, 0, + }, + .sriov_supported = 0, + .nvme_supported = 1, + .pf_max_iatu = { + 16, 16, 0, 0}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF, + .vf_deviceid = 0, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3}, + .xpcs_index = { + 0, 1, 2, 3, + }, + .min_vf_funcid = { + 0, 0, 0, 0, + }, + .max_vf_funcid = { + 0, 0, 0, 0, + }, + .sriov_supported = 0, + 
.nvme_supported = 1, + .pf_max_iatu = { + 8, 8, 8, 8}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = false, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_PF, + .vf_deviceid = DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_VF, + .max_pf = 2, + .pci_funcid = { + 0, 1, + }, + .xpcs_index = { + 0, 1, + }, + .min_vf_funcid = { + 0x3, 0xb, + }, + .max_vf_funcid = { + 0xa, 0x12, + }, + .sriov_supported = 1, + .nvme_supported = 1, + .pf_max_iatu = { + 8, 8, 0, 0}, + .vf_total_iatu = { + 8, 8, 0, 0}, + .link_config = &link_config[XPCS_PHY_10GBASER], + .raid_supported = true, + .upgrade_with_flowing = false, + }, + + { + .pf_deviceid = DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF, + .vf_deviceid = 0, + .max_pf = 4, + .pci_funcid = { + 0, 1, 2, 3}, + .xpcs_index = { + -1, -1, -1, -1, + }, + .min_vf_funcid = { + -1, -1, -1, -1, + }, + .max_vf_funcid = { + -1, -1, -1, -1, + }, + .sriov_supported = 0, + .nvme_supported = 1, + .pf_max_iatu = { + 8, 8, 8, 8}, + .vf_total_iatu = { + 0, 0, 0, 0}, + .link_config = &link_config[EXTERNAL_PHY_RGMII], + .raid_supported = true, + .upgrade_with_flowing = true, + }, + +}; + +static int PURE_PF_DEVICE[] = { + DN200_DEV_ID_SFP_10G_2P_PURE_PF, + DN200_DEV_ID_SFP_10G_4P_PURE_PF, + DN200_DEV_ID_SFP_1G_4P_PURE_PF, + DN200_DEV_ID_COPP_1G_4P_PURE_PF, +}; + +static int NVME_PURE_PF_DEVICE[] = { + DN200_DEV_ID_SFP_10G_2P_NVME_PUREPF, + DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF, + DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF, +}; + +static int SRIOV_VF_DEVICE[] = { + DN200_DEV_ID_SFP_10G_2P_SRIOV_VF, + DN200_DEV_ID_SFP_10G_4P_SRIOV_VF, + DN200_DEV_ID_SFP_1G_4P_SRIOV_VF, + DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_VF, +}; + +static int SRIOV_4P_DEVICE[] = { + DN200_DEV_ID_SFP_10G_4P_SRIOV_PF, + DN200_DEV_ID_SFP_10G_4P_PURE_PF, + DN200_DEV_ID_SFP_1G_4P_SRIOV_PF, + DN200_DEV_ID_SFP_1G_4P_PURE_PF, + DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF, +}; + +static int EXTERN_PHY_DEVICE[] = { + 
DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF, +}; +/*address limit for different product or driver*/ +struct dn200_addr_limit { + int drv_type; + int addr_bits_limit; + u64 addr_forbid_bits; +} addr_limit[] = { + { + .drv_type = DRV_PURE_PF, + .addr_bits_limit = 40, /* support 1TB memory normal memory access */ + .addr_forbid_bits = DN200_BASE_IATU_ADDR, + }, + { + .drv_type = DRV_SRIOV_PF, /* highest 3 bits used for iATU VF high addr map */ + .addr_bits_limit = 40, + .addr_forbid_bits = DN200_BASE_IATU_ADDR, /* higher 3 bits of 40bits used for iATU VF map */ + }, + { + .drv_type = DRV_VF, + .addr_bits_limit = 32, /* vf just support dma32 memory access */ + .addr_forbid_bits = 0, + }, +}; + +#define DN200_FIRST_VF_FUNC_ID 0x04 +#define DN200_IATU_ADDR_START 0xE0 + diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_ptp.c b/drivers/net/ethernet/dapustor/dn200/dn200_ptp.c new file mode 100644 index 000000000000..a013dd977363 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_ptp.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ +#include "dn200.h" +#include "dn200_ptp.h" + +/** + * dn200_adjust_freq + * + * @ptp: pointer to ptp_clock_info structure + * @scaled_ppm: desired period change in scaled parts per million + * + * Description: this function will adjust the frequency of hardware clock. + * + * Scaled parts per million is ppm with a 16-bit binary fractional field. 
+ */ +static int dn200_adjust_freq(struct ptp_clock_info *ptp, long scaled_ppm) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + unsigned long flags; + u32 addend; + + addend = adjust_by_scaled_ppm(priv->default_addend, scaled_ppm); + + spin_lock_irqsave(&priv->ptp_lock, flags); + dn200_config_addend(priv, priv->ptpaddr, addend); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + return 0; +} + +static struct timespec64 dn200_calc_tas_basetime(ktime_t old_base_time, + ktime_t current_time, u64 cycle_time) +{ + struct timespec64 time; + + if (ktime_after(old_base_time, current_time)) { + time = ktime_to_timespec64(old_base_time); + } else { + s64 n; + ktime_t base_time; + ktime_t timetest; + + timetest = ktime_sub_ns(current_time, old_base_time); + n = div64_s64(timetest, cycle_time); + base_time = ktime_add_ns(old_base_time, (n + 1) * cycle_time); + + time = ktime_to_timespec64(base_time); + } + + return time; +} + +/** + * dn200_adjust_time + * + * @ptp: pointer to ptp_clock_info structure + * @delta: desired change in nanoseconds + * + * Description: this function will shift/adjust the hardware clock time. + */ +static int dn200_adjust_time(struct ptp_clock_info *ptp, s64 delta) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + unsigned long flags; + u32 sec, nsec; + u32 quotient, reminder; + int neg_adj = 0; + bool xmac, est_rst = false; + int ret; + + xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; + + if (delta < 0) { + neg_adj = 1; + delta = -delta; + } + + quotient = div_u64_rem(delta, 1000000000ULL, &reminder); + sec = quotient; + nsec = reminder; + + /* If EST is enabled, disabled it before adjust ptp time. 
*/ + if (priv->plat->est && priv->plat->est->enable) { + est_rst = true; + mutex_lock(&priv->plat->est->lock); + priv->plat->est->enable = false; + dn200_est_configure(priv, priv->ioaddr, priv->plat->est, + priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); + } + + spin_lock_irqsave(&priv->ptp_lock, flags); + dn200_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + /* Caculate new basetime and re-configured EST after PTP time adjust. */ + if (est_rst) { + struct timespec64 current_time, time; + ktime_t current_time_ns, basetime; + u64 cycle_time; + + mutex_lock(&priv->plat->est->lock); + priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, + ¤t_time); + current_time_ns = timespec64_to_ktime(current_time); + time.tv_nsec = priv->plat->est->btr_reserve[0]; + time.tv_sec = priv->plat->est->btr_reserve[1]; + basetime = timespec64_to_ktime(time); + cycle_time = (u64) priv->plat->est->ctr[1] * NSEC_PER_SEC + + priv->plat->est->ctr[0]; + time = dn200_calc_tas_basetime(basetime, + current_time_ns, cycle_time); + + priv->plat->est->btr[0] = (u32) time.tv_nsec; + priv->plat->est->btr[1] = (u32) time.tv_sec; + priv->plat->est->enable = true; + ret = dn200_est_configure(priv, priv->ioaddr, priv->plat->est, + priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); + if (ret) + netdev_err(priv->dev, "failed to configure EST\n"); + } + + return 0; +} + +/** + * dn200_get_time + * + * @ptp: pointer to ptp_clock_info structure + * @ts: pointer to hold time/result + * + * Description: this function will read the current time from the + * hardware clock and store it in @ts. 
+ */ +static int dn200_get_time(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + unsigned long flags; + u64 ns = 0; + + spin_lock_irqsave(&priv->ptp_lock, flags); + dn200_get_systime(priv, priv->ptpaddr, &ns); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + *ts = ns_to_timespec64(ns); + + return 0; +} + +/** + * dn200_set_time + * + * @ptp: pointer to ptp_clock_info structure + * @ts: time value to set + * + * Description: this function will set the current time on the + * hardware clock. + */ +static int dn200_set_time(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + unsigned long flags; + + spin_lock_irqsave(&priv->ptp_lock, flags); + dn200_init_systime(priv, priv->ptpaddr, ts->tv_sec, ts->tv_nsec); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + + return 0; +} + +static int dn200_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + void __iomem *ptpaddr = priv->ptpaddr; + void __iomem *ioaddr = priv->hw->pcsr; + struct dn200_pps_cfg *cfg; + u32 intr_value, acr_value; + int ret = -EOPNOTSUPP; + unsigned long flags; + + switch (rq->type) { + case PTP_CLK_REQ_PEROUT: + /* Reject requests with unsupported flags */ + if (rq->perout.flags) + return -EOPNOTSUPP; + + cfg = &priv->pps[rq->perout.index]; + + cfg->start.tv_sec = rq->perout.start.sec; + cfg->start.tv_nsec = rq->perout.start.nsec; + cfg->period.tv_sec = rq->perout.period.sec; + cfg->period.tv_nsec = rq->perout.period.nsec; + + spin_lock_irqsave(&priv->ptp_lock, flags); + ret = dn200_flex_pps_config(priv, priv->ioaddr, + rq->perout.index, cfg, on, + priv->sub_second_inc, + priv->systime_flags); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + break; + case PTP_CLK_REQ_EXTTS: + priv->plat->ext_snapshot_en 
= on; + mutex_lock(&priv->aux_ts_lock); + acr_value = readl(ptpaddr + PTP_ACR); + acr_value &= ~PTP_ACR_MASK; + if (on) { + /* Enable External snapshot trigger */ + acr_value |= priv->plat->ext_snapshot_num; + acr_value |= PTP_ACR_ATSFC; + netdev_dbg(priv->dev, + "Auxiliary Snapshot %d enabled.\n", + priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); + /* Enable Timestamp Interrupt */ + intr_value = readl(ioaddr + XGMAC_INT_EN); + intr_value |= XGMAC_TSIE; + writel(intr_value, ioaddr + XGMAC_INT_EN); + + } else { + netdev_dbg(priv->dev, + "Auxiliary Snapshot %d disabled.\n", + priv->plat->ext_snapshot_num >> PTP_ACR_ATSEN_SHIFT); + /* Disable Timestamp Interrupt */ + intr_value = readl(ioaddr + XGMAC_INT_EN); + intr_value &= ~XGMAC_TSIE; + writel(intr_value, ioaddr + XGMAC_INT_EN); + } + writel(acr_value, ptpaddr + PTP_ACR); + mutex_unlock(&priv->aux_ts_lock); + ret = 0; + break; + + default: + break; + } + + return ret; +} + +/** + * dn200_get_syncdevicetime + * @device: current device time + * @system: system counter value read synchronously with device time + * @ctx: context provided by timekeeping code + * Description: Read device and system clock simultaneously and return the + * corrected clock values in ns. 
+ **/ +static int dn200_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + return -EOPNOTSUPP; +} + +static int dn200_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct dn200_priv *priv = + container_of(ptp, struct dn200_priv, ptp_clock_ops); + + return get_device_system_crosststamp(dn200_get_syncdevicetime, + priv, NULL, xtstamp); +} + +/* structure describing a PTP hardware clock */ +static struct ptp_clock_info dn200_ptp_clock_ops = { + .owner = THIS_MODULE, + .name = "dn200 ptp", + .max_adj = 62500000, + .n_alarm = 0, + .n_ext_ts = 0, /* will be overwritten in dn200_ptp_register */ + .n_per_out = 0, /* will be overwritten in dn200_ptp_register */ + .n_pins = 0, + .pps = 0, + .adjfine = dn200_adjust_freq, + .adjtime = dn200_adjust_time, + .gettime64 = dn200_get_time, + .settime64 = dn200_set_time, + .enable = dn200_enable, + .getcrosststamp = dn200_getcrosststamp, +}; + +/** + * dn200_ptp_register + * @priv: driver private structure + * Description: this function will register the ptp clock driver + * to kernel. It also does some house keeping work. 
+ */ +void dn200_ptp_register(struct dn200_priv *priv) +{ + int i; + + for (i = 0; i < priv->dma_cap.pps_out_num; i++) { + if (i >= DN200_PPS_MAX) + break; + priv->pps[i].available = true; + } + + if (priv->plat->ptp_max_adj) + dn200_ptp_clock_ops.max_adj = priv->plat->ptp_max_adj; + + dn200_ptp_clock_ops.n_per_out = priv->dma_cap.pps_out_num; + dn200_ptp_clock_ops.n_ext_ts = priv->dma_cap.aux_snapshot_n; + + spin_lock_init(&priv->ptp_lock); + mutex_init(&priv->aux_ts_lock); + priv->ptp_clock_ops = dn200_ptp_clock_ops; + + priv->ptp_clock = ptp_clock_register(&priv->ptp_clock_ops, + priv->device); + if (IS_ERR(priv->ptp_clock)) { + netdev_err(priv->dev, "register PTP clock failed\n"); + priv->ptp_clock = NULL; + } +} + +/** + * dn200_ptp_unregister + * @priv: driver private structure + * Description: this function will remove/unregister the ptp clock driver + * from the kernel. + */ +void dn200_ptp_unregister(struct dn200_priv *priv) +{ + if (priv->ptp_clock) { + ptp_clock_unregister(priv->ptp_clock); + priv->ptp_clock = NULL; + pr_debug("Removed PTP HW clock successfully on %s\n", + priv->dev->name); + } + + mutex_destroy(&priv->aux_ts_lock); +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_ptp.h b/drivers/net/ethernet/dapustor/dn200/dn200_ptp.h new file mode 100644 index 000000000000..fc4c4d78d0af --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_ptp.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#define __DN200_PTP_H__ + +#define PTP_XGMAC_OFFSET 0xd00 +#define PTP_GMAC4_OFFSET 0xb00 +#define PTP_GMAC3_X_OFFSET 0x700 + +/* IEEE 1588 PTP register offsets */ +#define PTP_TCR 0x00 /* Timestamp Control Reg */ +#define PTP_SSIR 0x04 /* Sub-Second Increment Reg */ +#define PTP_STSR 0x08 /* System Time – Seconds Regr */ +#define PTP_STNSR 0x0c /* System Time – Nanoseconds Reg */ +#define PTP_STSUR 0x10 /* System Time – Seconds Update Reg */ +#define PTP_STNSUR 0x14 /* System Time – Nanoseconds Update Reg */ +#define PTP_TAR 0x18 /* Timestamp Addend Reg */ +#define PTP_ACR 0x40 /* Auxiliary Control Reg */ +#define PTP_ATNR 0x48 /* Auxiliary Timestamp - Nanoseconds Reg */ +#define PTP_ATSR 0x4c /* Auxiliary Timestamp - Seconds Reg */ + +#define PTP_STNSUR_ADDSUB_SHIFT 31 +#define PTP_DIGITAL_ROLLOVER_MODE 0x3B9ACA00 /* 10e9-1 ns */ +#define PTP_BINARY_ROLLOVER_MODE 0x80000000 /* ~0.466 ns */ + +/* PTP Timestamp control register defines */ +#define PTP_TCR_TSENA BIT(0) /* Timestamp Enable */ +#define PTP_TCR_TSCFUPDT BIT(1) /* Timestamp Fine/Coarse Update */ +#define PTP_TCR_TSINIT BIT(2) /* Timestamp Initialize */ +#define PTP_TCR_TSUPDT BIT(3) /* Timestamp Update */ +#define PTP_TCR_TSTRIG BIT(4) /* Timestamp Interrupt Trigger Enable */ +#define PTP_TCR_TSADDREG BIT(5) /* Addend Reg Update */ +#define PTP_TCR_TSENALL BIT(8) /* Enable Timestamp for All Frames */ +#define PTP_TCR_TSCTRLSSR BIT(9) /* Digital or Binary Rollover Control */ +/* Enable PTP packet Processing for Version 2 Format */ +#define PTP_TCR_TSVER2ENA BIT(10) +/* Enable Processing of PTP over Ethernet Frames */ +#define PTP_TCR_TSIPENA BIT(11) +/* Enable Processing of PTP Frames Sent over IPv6-UDP */ +#define PTP_TCR_TSIPV6ENA BIT(12) +/* Enable Processing of PTP Frames Sent over IPv4-UDP */ +#define PTP_TCR_TSIPV4ENA BIT(13) +/* Enable Timestamp Snapshot for Event Messages */ +#define PTP_TCR_TSEVNTENA BIT(14) +/* Enable Snapshot for Messages Relevant to Master */ +#define 
PTP_TCR_TSMSTRENA BIT(15) +/* Select PTP packets for Taking Snapshots + * On gmac4 specifically: + * Enable SYNC, Pdelay_Req, Pdelay_Resp when TSEVNTENA is enabled. + * or + * Enable SYNC, Follow_Up, Delay_Req, Delay_Resp, Pdelay_Req, Pdelay_Resp, + * Pdelay_Resp_Follow_Up if TSEVNTENA is disabled + */ +#define PTP_TCR_SNAPTYPSEL_1 BIT(16) +/* Enable MAC address for PTP Frame Filtering */ +#define PTP_TCR_TSENMACADDR BIT(18) + +/* SSIR defines */ +#define PTP_SSIR_SSINC_MASK 0xff +#define GMAC4_PTP_SSIR_SSINC_SHIFT 16 + +/* Auxiliary Control defines */ +#define PTP_ACR_ATSFC BIT(0) /* Auxiliary Snapshot FIFO Clear */ +#define PTP_ACR_ATSEN0 BIT(4) /* Auxiliary Snapshot 0 Enable */ +#define PTP_ACR_ATSEN1 BIT(5) /* Auxiliary Snapshot 1 Enable */ +#define PTP_ACR_ATSEN2 BIT(6) /* Auxiliary Snapshot 2 Enable */ +#define PTP_ACR_ATSEN3 BIT(7) /* Auxiliary Snapshot 3 Enable */ +#define PTP_ACR_ATSEN_SHIFT 5 /* Auxiliary Snapshot shift */ +#define PTP_ACR_MASK GENMASK(7, 4) /* Aux Snapshot Mask */ +#define PMC_ART_VALUE0 0x01 /* PMC_ART[15:0] timer value */ +#define PMC_ART_VALUE1 0x02 /* PMC_ART[31:16] timer value */ +#define PMC_ART_VALUE2 0x03 /* PMC_ART[47:32] timer value */ +#define PMC_ART_VALUE3 0x04 /* PMC_ART[63:48] timer value */ +#define GMAC4_ART_TIME_SHIFT 16 /* ART TIME 16-bits shift */ + +enum aux_snapshot { + AUX_SNAPSHOT0 = 0x10, + AUX_SNAPSHOT1 = 0x20, + AUX_SNAPSHOT2 = 0x40, + AUX_SNAPSHOT3 = 0x80, +}; + diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_reg.c b/drivers/net/ethernet/dapustor/dn200/dn200_reg.c new file mode 100644 index 000000000000..7b698077a34a --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_reg.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024 Dapustor Corporation . + * dn200 features(e.g. iatu) register read write. 
+ */ + +#include "dn200.h" +#include "dwxgmac_comm.h" +#include "dn200_iatu.h" +#include "dn200_reg.h" + +void dn200_iatu_tbl_entry_write(void __iomem *ioaddr, + struct dn200_iatu_tbl_entry *iatu_entry, + u8 iatu_index, bool enable) +{ + u32 type = 0; + u32 region_off = (u32)(iatu_index * 0x200); + void __iomem *iatu_reg_base = ioaddr + region_off; + u64 base_addr = iatu_entry->base_addr; + u8 pf_nmb = iatu_entry->pf_id; + bool is_vf = iatu_entry->is_vf; + u8 vf_nmb = iatu_entry->vf_offset; + u32 limit = iatu_entry->limit_mask; + u64 tgt_addr = iatu_entry->tgt_addr; + + pr_debug("PF[%d].VF[%d] Configure Outbound: region:%d, base 0x%llx to target 0x%llx limit 0x%llx\n", + iatu_entry->pf_id, iatu_entry->vf_offset, iatu_index, + iatu_entry->base_addr, iatu_entry->tgt_addr, + iatu_entry->limit_mask); + + if (iatu_index >= IATU_IATU_MAX_REGION) { + pr_err("HW supports only max 32 address region, err occur in iatu write!\n"); + return; + } + if (!enable) + writel(0x7fffffff & readl(iatu_reg_base + 0x04), iatu_reg_base + 0x04); + /* 1. Setup the Region Base and Limit Address Registers. */ + writel((u32)(base_addr), iatu_reg_base + 0x08); + writel((u32)(base_addr >> 32), iatu_reg_base + 0x0C); + writel((u32)limit, iatu_reg_base + 0x10); + + /* 2. Setup the Target Address Registers. */ + writel((u32)(tgt_addr), iatu_reg_base + 0x14); + writel((u32)(tgt_addr >> 32), iatu_reg_base + 0x18); + + /* 3. Configure the region through the Region Control 1 Register. */ + writel(((pf_nmb << 20) & GENMASK(22, 20)) + (type & IATU_OB_TYPE), + iatu_reg_base + 0x00); + if (is_vf) + writel((1 << 31) + vf_nmb, iatu_reg_base + 0x1C); + + /* 4. Enable the region and set BDF from application. 
*/ + if (enable) + writel(IATU_OB_REGION_EN, iatu_reg_base + 0x04); +} + +void dn200_iatu_tgt_addr_updt(void __iomem *ioaddr, u8 iatu_index, + u64 tgt_addr) +{ + u32 region_off = (u32) (iatu_index * 0x200); + void __iomem *iatu_reg_base = ioaddr + region_off; + + pr_debug("%s, %d, Configure Outbound: iatu_index:%d, target 0x%llx\n", + __func__, __LINE__, iatu_index, tgt_addr); + + if (iatu_index >= IATU_IATU_MAX_REGION) { + pr_info("HW supports only max 32 address region, err occur in iatu update!\n"); + return; + } + + /* Setup the Target Address Registers. */ + writel((u32)(tgt_addr), iatu_reg_base + 0x14); + writel((u32)(tgt_addr >> 32), iatu_reg_base + 0x18); +} + +void dn200_enable_tx_dma_irq(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = XGMAC_DMA_INT_DEFAULT_EN | XGMAC_DMA_INT_DEFAULT_TX; + + writel(value, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); +} + +void dn200_disable_tx_dma_irq(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = XGMAC_DMA_INT_DEFAULT_EN & (~XGMAC_DMA_INT_DEFAULT_TX); + + writel(value, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_reg.h b/drivers/net/ethernet/dapustor/dn200/dn200_reg.h new file mode 100644 index 000000000000..afb1bd6372da --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_reg.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ * + */ +#ifndef __DN200_REG_H__ +#define __DN200_REG_H__ +#include "common.h" + +#define IATU_IATU_MAX_REGION 32 +#define IATU_OB_TYPE GENMASK(4, 0) +#define IATU_OB_REGION_EN BIT(31) +#define IATU_OB_FUNC_BYPASS BIT(19) + +void dn200_iatu_tbl_entry_write(void __iomem *ioaddr, + struct dn200_iatu_tbl_entry *iatu_entry, + u8 iatu_index, bool enable); +void dn200_iatu_tgt_addr_updt(void __iomem *ioaddr, u8 iatu_index, + u64 tgt_addr); + +void dn200_enable_tx_dma_irq(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); +void dn200_disable_tx_dma_irq(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_self.h b/drivers/net/ethernet/dapustor/dn200/dn200_self.h new file mode 100644 index 000000000000..f760ccf6565a --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_self.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + * + */ +#ifndef __DN200_SELF_H__ +#define __DN200_SELF_H__ + +#include "dn200.h" +#include "dwxgmac_comm.h" +#include "dn200_phy.h" +#include "dn200_sriov.h" +#include "dn200_ctrl.h" + +struct pcb_type { + u8 index; + u8 value; +}; + +enum dn200_hw_pcb_gpio_val { + DN200_HW_PCB_GPIO_VAL_0 = 0b000, + DN200_HW_PCB_GPIO_VAL_1 = 0b001, + DN200_HW_PCB_GPIO_VAL_2 = 0b010, + DN200_HW_PCB_GPIO_VAL_3 = 0b011, +}; + +enum dn200_hw_pcb_type { + DN200_HW_PCB_TYPE_0 = 0x0, + DN200_HW_PCB_TYPE_1 = 0x1, + DN200_HW_PCB_TYPE_2 = 0x2, + DN200_HW_PCB_TYPE_3 = 0x3, +}; +/* DAPUSOTR vendor and device ID */ +#define PCI_VENDOR_ID_DAPUSTOR 0x1E3B +#define DN200_DEV_ID_SFP_10G_2P_PURE_PF 0x3000 +#define DN200_DEV_ID_SFP_10G_2P_SRIOV_PF 0x3001 +#define DN200_DEV_ID_SFP_10G_2P_SRIOV_VF 0x3002 +#define DN200_DEV_ID_SFP_10G_4P_PURE_PF 0x3003 +#define DN200_DEV_ID_SFP_10G_4P_SRIOV_PF 0x3004 +#define DN200_DEV_ID_SFP_10G_4P_SRIOV_VF 0x3005 +#define DN200_DEV_ID_SFP_1G_4P_PURE_PF 0x3006 +#define DN200_DEV_ID_SFP_1G_4P_SRIOV_PF 
0x3007 +#define DN200_DEV_ID_SFP_1G_4P_SRIOV_VF 0x3008 +#define DN200_DEV_ID_COPP_1G_4P_PURE_PF 0x3009 +#define DN200_DEV_ID_SFP_10G_2P_NVME_PUREPF 0x300A +#define DN200_DEV_ID_SFP_10G_4P_NVME_PUREPF 0x300B +#define DN200_DEV_ID_COPP_1G_4P_NVME_PUREPF 0x300C +#define DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_PF 0x3100 +#define DN200_DEV_ID_SFP_10G_2P_RAID_SRIOV_VF 0x3101 + +#define DN200_FAILURE (-1) +#define INVALID_FUNCID (-2) + +#define SRIOV_LRAM_BAR_OFF 4 + +/*dwmac-dn200.c*/ +#define DN200_SFPCTRL_MODE0_BAROFF 0x40030 +#define DN200_SFPCTRL_MODE1_BAROFF 0x40000 +#define XGE_TOP_CONFIG_OFFSET 0x30000 +#define DN200_PCIE_BAROFF 0x20000 + +#define XGE_MSI_CONFIG_OFFSET (XGE_TOP_CONFIG_OFFSET + 0x600) + +#define XGE_XGMAC_CLK_MUX_CTRL(x) (0xC + XGE_TOP_CONFIG_OFFSET + (x) * 0x50) +#define XGE_XGMAC_CLK_TX_CTRL(x) (0x10 + XGE_TOP_CONFIG_OFFSET + (x) * 0x50) +#define XGE_XGMAC_CLK_MUX_ENABLE_CTRL(x) (0x1C + XGE_TOP_CONFIG_OFFSET + (x) * 0x50) +#define XGE_XGMAC_XPCS_SW_RST(x) (0x34 + XGE_TOP_CONFIG_OFFSET + (x) * 0x50) + +#define XGE_PCIE_PCS_PHY1_TX_RATE_REG(x) (0x2440 + DN200_PCIE_BAROFF - (x) * 0xd0) +#define XGE_PCIE_PCS_PHY1_RX_RATE_REG(x) (0x2130 + DN200_PCIE_BAROFF - (x) * 0x110) + +#define XGE_MSI_INTR_MASK_LOW(x) (0x10 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_MASK_HIGH(x) (0x14 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_EN_LOW(x) (0x18 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_EN_HIGH(x) (0x1C + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) + +#define XGE_MSIX_INTR_EN_LOW(x) (0x20 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSIX_INTR_EN_HIGH(x) (0x24 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSIX_INTR_MASK_LOW(x) (0x28 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSIX_INTR_MASK_HIGH(x) (0x2C + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) + +#define XGE_INTR_INFO_CONFIG(x) (0x30 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) + +#define XGE_MSI_INTR_SRCMAP0_4(x) (0x34 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define 
XGE_MSI_INTR_SRCMAP5_9(x) (0x38 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP10_14(x) (0x3C + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP15_19(x) (0x40 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP20_24(x) (0x44 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP25_29(x) (0x48 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP30_34(x) (0x4C + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP35_39(x) (0x50 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP40_44(x) (0x54 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP45_49(x) (0x58 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP50_54(x) (0x5C + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP55_59(x) (0x60 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) +#define XGE_MSI_INTR_SRCMAP60_64(x) (0x64 + XGE_MSI_CONFIG_OFFSET + (x) * 0x80) + +#define PCIE_PHY1_REFA_CLK_SEL_REG 0x17b8 +#define PCIE_PHY1_REFB_CLK_SEL_REG 0x17c4 +#define PCIE_PHY0_REFA_CLKDET_EN_REG 0x1794 +#define PCIE_PHY0_REFB_CLKDET_EN_REG 0x17a0 +#define PCIE_PHY1_REFA_CLKDET_EN_REG 0x17bc +#define PCIE_PHY1_REFB_CLKDET_EN_REG 0x17c8 +#define PCIE_PHY1_REFA_CLKDET_RESULT_REG 0x17c0 +#define PCIE_PHY1_REFB_CLKDET_RESULT_REG 0x17cc +#define PCIE_PHY0_REFA_CLKDET_RESULT_REG 0x1798 +#define PCIE_PHY0_REFB_CLKDET_RESULT_REG 0x17A4 + +#define XPCS_VR_XS_PMA_MP_12G_16G_25G_SRAM 0x6026c +#define XPCA_SR_AN_CTRL 0x1C0000 +#define XPCS_REG_BASE 0x2c000000 +#define XPCS_REG_OFFSET 0x800000 +#define XPCS_12G_16G_25G_TX_GENCTRL0 0x600c0 +#define XGE01_QUEUES_TO_USE 8 +#define XGE23_QUEUES_TO_USE 2 + +struct xge_private_data { + unsigned char phy_addr; + unsigned char bus_id; + unsigned char tx_queues_to_use; + unsigned char rx_queues_to_use; + unsigned int clk_ptp_rate; + unsigned int clk_csr; + unsigned char rx_queues_reserved; + unsigned char tx_queues_reserved; + unsigned char rx_queues_total; + 
unsigned char tx_queues_total; + unsigned char rx_used_mtl_queues; + unsigned char max_vfs; + phy_interface_t phy_interface; + int max_speed; + void __iomem *topaddr; + void __iomem *xpcsaddr; +}; + +enum XGE_LINK_MODE { + EXTERNAL_PHY_1000BASEX, + EXTERNAL_PHY_SGMII, + EXTERNAL_PHY_RGMII, + XPCS_PHY_1000BASEX, + XPCS_PHY_10GBASER, + XGE_LINK_MODE_UNKOWN, + XGE_LINK_MODE_MAX, +}; + +struct xge_link_config { + unsigned char has_xpcs; + unsigned int clk_ptp_rate; + unsigned int clk_csr; + phy_interface_t phy_interface; + int max_speed; + unsigned int clk_ref_rate; +}; + +struct dn200_gpio_data { + u32 gpio_addr_offset; + int sfp_detect_pin; + int sfp_tx_disable_pin; + int sfp_tx_fault_pin; + int sfp_rx_los_pin; + int sfp_rs0_pin; + int sfp_rs1_pin; + int sfp_led1_pin; // low active + int sfp_led2_pin; // low active + u16 reg_off_set_read; + u16 reg_off_set_write; +}; + +struct xge_private_data_id { + unsigned char pf_id; + const struct xge_private_data compat; +}; + +struct plat_dn200_data { + struct dn200_priv *priv_back; + struct pci_dev *pdev; + void __iomem *io_addr; + u8 rx_queues_total; + u8 tx_queues_total; + u8 rx_used_mtl_queues; //mtl queues used for rx + u8 rx_queues_reserved; //rx queues reserved for all VFs per PF + u8 tx_queues_reserved; //tx queues reserved for all VFs per PF + u8 rx_queue_start; + u8 tx_queue_start; + u8 *pf_max_iatu; + u8 *vf_total_iatu; + bool raid_supported; /* true: support raid; flase: not support raid */ + bool use_msi; + bool is_vf; + bool sriov_cfg; + bool vf_flag; + bool sriov_supported; + bool nvme_supported; + bool vf_in_rst; + u8 max_vfs; + u8 pf_id; + u8 total_pfs; + int xpcs_index; + u8 vf_offset; /*vf index within a pf */ + u8 funcid; + bool has_xpcs; + u8 total_irq; + u8 speed_cmd; + u8 vf_loss_hb_cnt[DN200_MAX_VF_NUM + 1]; + struct dn200_ctrl_resource ctrl; + struct dn200_phy_info *phy_info; + struct dn200_pf_info pf; + struct dn200_gpio_data *gpio_data; + int msi_xpcs_vec; + u32 bitmap_vlan; + u16 max_num_vlan; 
/*Maximum number of vlans that can be configured */ + u16 vlan_num; /*Number of configured vlans */ + u16 vlan_id[64]; /* VF supports a maximum of 48 vlan_id */ + int addr_bits_limit; + u64 addr_forbid_bits; + u8 hw_pcb_ver_type; + int max_speed; + u8 default_rx_queue_num; + u8 default_tx_queue_num; + u8 hw_rj45_type; + bool upgrade_with_flowing; +}; + +#define HW_IS_VF(hw) ((hw)->priv->plat_ex->is_vf) +#define PRIV_IS_VF(priv) ((priv)->plat_ex->is_vf) +#define PRIV_IS_PUREPF(priv) \ + (!((priv)->plat_ex->is_vf || (priv)->plat_ex->sriov_supported)) +#define HW_IS_PUREPF(hw) \ + (!((hw)->priv->plat_ex->is_vf || (hw)->priv->plat_ex->sriov_supported)) +#define DN200_LAST_QUEUE(priv) ((priv)->plat_ex->rx_queues_total - 1) +#define DN200_MTL_QUEUE_IS_VALID(priv, queue) \ + (((queue) < (priv)->plat_ex->rx_used_mtl_queues) || \ + (!PRIV_IS_VF(priv) && (queue == DN200_LAST_QUEUE(priv)))) +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_selftests.c b/drivers/net/ethernet/dapustor/dn200/dn200_selftests.c new file mode 100644 index 000000000000..f762f49ad01d --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_selftests.c @@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dn200.h" +#include "dn200_ctrl.h" +#include +#include +struct dn200hdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +#define DN200_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct dn200hdr)) +#define DN200_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define DN200_LB_TIMEOUT msecs_to_jiffies(200) + +struct dn200_packet_attrs { + int vlan; + int vlan_id_in; + int vlan_id_out; + const unsigned char *src; + const unsigned char *dst; + u32 ip_src; + u32 ip_dst; + int tcp; + int sport; + int dport; + u32 exp_hash; + int dont_wait; + int timeout; + int size; + int max_size; + int remove_sa; + u8 id; + int sarc; + u16 queue_mapping; + u64 timestamp; +}; + +static u8 dn200_test_next_id; + +static struct sk_buff *dn200_test_get_udp_skb(struct dn200_priv *priv, + struct dn200_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct dn200hdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + DN200_TEST_PKT_SIZE; + if (attr->vlan) { + size += 4; + if (attr->vlan > 1) + size += 4; + } + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(priv->dev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + if (attr->vlan > 1) + ehdr = skb_push(skb, ETH_HLEN + 8); + else if (attr->vlan) + ehdr = skb_push(skb, ETH_HLEN + 4); + else if (attr->remove_sa) + ehdr = skb_push(skb, ETH_HLEN - 6); + else + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = 
skb_put(skb, sizeof(*uhdr)); + + if (!attr->remove_sa) + eth_zero_addr(ehdr->h_source); + eth_zero_addr(ehdr->h_dest); + if (attr->src && !attr->remove_sa) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + if (!attr->remove_sa) { + ehdr->h_proto = htons(ETH_P_IP); + } else { + /* HACK */ + ehdr->h_proto = htons(ETH_P_IP); + } + + if (attr->vlan) { + __be16 *tag, *proto; + + if (!attr->remove_sa) { + tag = (void *)ehdr + ETH_HLEN; + proto = (void *)ehdr + (2 * ETH_ALEN); + } else { + tag = (void *)ehdr + ETH_HLEN - 6; + proto = (void *)ehdr + ETH_ALEN; + } + + proto[0] = htons(ETH_P_8021Q); + tag[0] = htons(attr->vlan_id_out); + tag[1] = htons(ETH_P_IP); + if (attr->vlan > 1) { + proto[0] = htons(ETH_P_8021AD); + tag[1] = htons(ETH_P_8021Q); + tag[2] = htons(attr->vlan_id_in); + tag[3] = htons(ETH_P_IP); + } + } + + if (attr->tcp) { + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(DN200_TEST_PKT_MAGIC); + attr->id = dn200_test_next_id; + shdr->id = 
dn200_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && (attr->max_size > skb->len)) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = + ~tcp_v4_check(skb->len, ihdr->saddr, ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = priv->dev; + + if (attr->timestamp) + skb->tstamp = ns_to_ktime(attr->timestamp); + + return skb; +} + +static struct sk_buff *dn200_test_get_arp_skb(struct dn200_priv *priv, + struct dn200_packet_attrs *attr) +{ + __be32 ip_src = htonl(attr->ip_src); + __be32 ip_dst = htonl(attr->ip_dst); + struct sk_buff *skb = NULL; + + skb = arp_create(ARPOP_REQUEST, ETH_P_ARP, ip_dst, priv->dev, ip_src, + NULL, attr->src, attr->dst); + if (!skb) + return NULL; + + skb->pkt_type = PACKET_HOST; + skb->dev = priv->dev; + + return skb; +} + +struct dn200_test_priv { + struct dn200_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +static int dn200_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct dn200_test_priv *tpriv = pt->af_packet_priv; + const unsigned char *src = tpriv->packet->src; + const unsigned char *dst = tpriv->packet->dst; + struct dn200hdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (DN200_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + if 
(tpriv->packet->sarc) { + if (!ether_addr_equal_unaligned(ehdr->h_source, ehdr->h_dest)) + goto out; + } else if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *) ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct dn200hdr *)((u8 *) thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *) ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct dn200hdr *)((u8 *) uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(DN200_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->exp_hash && !skb->hash) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __dn200_test_loopback(struct dn200_priv *priv, + struct dn200_packet_attrs *attr) +{ + struct dn200_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = dn200_test_loopback_validate; + tpriv->pt.dev = priv->dev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + + if (!attr->dont_wait) + dev_add_pack(&tpriv->pt); + + skb = dn200_test_get_udp_skb(priv, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret) + goto cleanup; + + if (attr->dont_wait) + goto cleanup; + + if (!attr->timeout) + attr->timeout = DN200_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 
0 : -ETIMEDOUT; +cleanup: + if (!attr->dont_wait) + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + + return ret; +} + +static int dn200_test_mac_loopback(struct dn200_priv *priv) +{ + struct dn200_packet_attrs attr = { }; + + attr.dst = priv->dev->dev_addr; + return __dn200_test_loopback(priv, &attr); +} + +static int dn200_test_phy_loopback(struct dn200_priv *priv) +{ + struct dn200_packet_attrs attr = { }; + int ret; + + if (!priv->dev->phydev) + return -EOPNOTSUPP; + + attr.dst = priv->dev->dev_addr; + ret = __dn200_test_loopback(priv, &attr); + + return ret; +} + +static int dn200_test_mmc(struct dn200_priv *priv) +{ + struct dn200_counters *initial, *final; + int ret; + + if (!priv->dma_cap.rmon) + return -EOPNOTSUPP; + + initial = kzalloc(sizeof(*initial), GFP_KERNEL); + if (!initial) + return -ENOMEM; + + final = kzalloc(sizeof(*final), GFP_KERNEL); + if (!final) { + ret = -ENOMEM; + goto out_free_initial; + } + memset(initial, 0, sizeof(*initial)); + memset(final, 0, sizeof(*final)); + /* Save previous results into internal struct */ + dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc); + memcpy(initial, &priv->mmc, sizeof(*initial)); + + ret = dn200_test_mac_loopback(priv); + if (ret) + goto out_free_final; + + /* These will be loopback results so no need to save them */ + dn200_mmc_read(priv, priv->mmcaddr, &priv->mmc); + memcpy(final, &priv->mmc, sizeof(*final)); + /* The number of MMC counters available depends on HW configuration + * so we just use this one to validate the feature. I hope there is + * not a version without this counter. 
+ */ + if (final->mmc_tx_framecount_g <= initial->mmc_tx_framecount_g) { + ret = -EINVAL; + goto out_free_final; + } + +out_free_final: + kfree(final); +out_free_initial: + kfree(initial); + return ret; +} + +static int dn200_filter_check(struct dn200_priv *priv) +{ + if (!(priv->dev->flags & IFF_PROMISC)) + return 0; + + netdev_warn(priv->dev, "Test can't be run in promiscuous mode!\n"); + return -EOPNOTSUPP; +} + +static bool dn200_hash_check(struct dn200_priv *priv, unsigned char *addr) +{ + int mc_offset = 32 - priv->hw->mcast_bits_log2; + struct netdev_hw_addr *ha; + u32 hash, hash_nr; + + /* First compute the hash for desired addr */ + hash = bitrev32(~crc32_le(~0, addr, 6)) >> mc_offset; + hash_nr = hash >> 5; + hash = 1 << (hash & 0x1f); + + /* Now, check if it collides with any existing one */ + netdev_for_each_mc_addr(ha, priv->dev) { + u32 nr = + bitrev32(~crc32_le(~0, ha->addr, ETH_ALEN)) >> mc_offset; + if (((nr >> 5) == hash_nr) && ((1 << (nr & 0x1f)) == hash)) + return false; + } + + /* No collisions, address is good to go */ + return true; +} + +static bool dn200_perfect_check(struct dn200_priv *priv, unsigned char *addr) +{ + struct netdev_hw_addr *ha; + + /* Check if it collides with any existing one */ + netdev_for_each_uc_addr(ha, priv->dev) { + if (!memcmp(ha->addr, addr, ETH_ALEN)) + return false; + } + + /* No collisions, address is good to go */ + return true; +} + +static int dn200_test_hfilt(struct dn200_priv *priv) +{ + unsigned char gd_addr[ETH_ALEN] = { 0xf1, 0xee, 0xdd, 0xcc, 0xbb, 0xaa }; + unsigned char bd_addr[ETH_ALEN] = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct dn200_packet_attrs attr = { }; + int ret, tries = 256; + + ret = dn200_filter_check(priv); + if (ret) + return ret; + + if (netdev_mc_count(priv->dev) >= priv->hw->multicast_filter_bins) + return -EOPNOTSUPP; + + while (--tries) { + /* We only need to check the bd_addr for collisions */ + bd_addr[ETH_ALEN - 1] = tries; + if (dn200_hash_check(priv, bd_addr)) + break; 
+ } + if (!tries) + return -EOPNOTSUPP; + + ret = dev_mc_add(priv->dev, gd_addr); + if (ret) + return ret; + usleep_range(10000, 20000); + attr.dst = gd_addr; + + /* Shall receive packet */ + ret = __dn200_test_loopback(priv, &attr); + if (ret) + goto cleanup; + + attr.dst = bd_addr; + + /* Shall NOT receive packet */ + ret = __dn200_test_loopback(priv, &attr); + ret = ret ? 0 : -EINVAL; + +cleanup: + dev_mc_del(priv->dev, gd_addr); + usleep_range(10000, 20000); + return ret; +} + +static int dn200_test_pfilt(struct dn200_priv *priv) +{ + unsigned char gd_addr[ETH_ALEN] = { 0xf0, 0x01, 0x44, 0x55, 0x66, 0x77 }; + unsigned char bd_addr[ETH_ALEN] = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct dn200_packet_attrs attr = { }; + int ret, tries = 256; + + if (dn200_filter_check(priv)) + return -EOPNOTSUPP; + if (netdev_uc_count(priv->dev) >= priv->hw->unicast_filter_entries) + return -EOPNOTSUPP; + + while (--tries) { + /* We only need to check the bd_addr for collisions */ + bd_addr[ETH_ALEN - 1] = tries; + if (dn200_perfect_check(priv, bd_addr)) + break; + } + + if (!tries) + return -EOPNOTSUPP; + + ret = dev_uc_add(priv->dev, gd_addr); + if (ret) + return ret; + usleep_range(10000, 20000); + attr.dst = gd_addr; + + /* Shall receive packet */ + ret = __dn200_test_loopback(priv, &attr); + if (ret) + goto cleanup; + + attr.dst = bd_addr; + + /* Shall NOT receive packet */ + ret = __dn200_test_loopback(priv, &attr); + ret = ret ? 
0 : -EINVAL; + +cleanup: + dev_uc_del(priv->dev, gd_addr); + usleep_range(10000, 20000); + return ret; +} + +static int dn200_test_mcfilt(struct dn200_priv *priv) +{ + unsigned char uc_addr[ETH_ALEN] = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff }; + unsigned char mc_addr[ETH_ALEN] = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct dn200_packet_attrs attr = { }; + int ret, tries = 256; + + if (dn200_filter_check(priv)) + return -EOPNOTSUPP; + if (netdev_uc_count(priv->dev) >= priv->hw->unicast_filter_entries) + return -EOPNOTSUPP; + if (netdev_mc_count(priv->dev) >= priv->hw->multicast_filter_bins) + return -EOPNOTSUPP; + while (--tries) { + /* We only need to check the mc_addr for collisions */ + mc_addr[ETH_ALEN - 1] = tries; + if (dn200_hash_check(priv, mc_addr)) + break; + } + if (!tries) + return -EOPNOTSUPP; + + ret = dev_uc_add(priv->dev, uc_addr); + if (ret) + return ret; + usleep_range(10000, 20000); + attr.dst = uc_addr; + + /* Shall receive packet */ + ret = __dn200_test_loopback(priv, &attr); + if (ret) + goto cleanup; + + attr.dst = mc_addr; + + /* Shall NOT receive packet */ + ret = __dn200_test_loopback(priv, &attr); + ret = ret ? 
0 : -EINVAL; + +cleanup: + dev_uc_del(priv->dev, uc_addr); + usleep_range(10000, 20000); + return ret; +} + +static int dn200_test_ucfilt(struct dn200_priv *priv) +{ + unsigned char uc_addr[ETH_ALEN] = { 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff }; + unsigned char mc_addr[ETH_ALEN] = { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct dn200_packet_attrs attr = { }; + int ret, tries = 256; + + if (dn200_filter_check(priv)) + return -EOPNOTSUPP; + if (netdev_uc_count(priv->dev) >= priv->hw->unicast_filter_entries) + return -EOPNOTSUPP; + if (netdev_mc_count(priv->dev) >= priv->hw->multicast_filter_bins) + return -EOPNOTSUPP; + + while (--tries) { + /* We only need to check the uc_addr for collisions */ + uc_addr[ETH_ALEN - 1] = tries; + if (dn200_perfect_check(priv, uc_addr)) + break; + } + + if (!tries) + return -EOPNOTSUPP; + + ret = dev_mc_add(priv->dev, mc_addr); + if (ret) + return ret; + usleep_range(10000, 20000); + attr.dst = mc_addr; + + /* Shall receive packet */ + ret = __dn200_test_loopback(priv, &attr); + if (ret) + goto cleanup; + + attr.dst = uc_addr; + + /* Shall NOT receive packet */ + ret = __dn200_test_loopback(priv, &attr); + ret = ret ? 
0 : -EINVAL; + +cleanup: + dev_mc_del(priv->dev, mc_addr); + usleep_range(10000, 20000); + return ret; +} +static inline bool dn200_napi_reschedule(struct napi_struct *napi) +{ + if (napi_schedule_prep(napi)) { + __napi_schedule(napi); + return true; + } + return false; +} + +static int dn200_test_rss(struct dn200_priv *priv) +{ + struct dn200_packet_attrs attr = { }; + + if (!priv->dma_cap.rssen || !priv->rss.enable) + return -EOPNOTSUPP; + + attr.src = priv->dev->dev_addr; + attr.dst = priv->dev->dev_addr; + attr.ip_dst = 0x728A86D7; + attr.ip_src = 0x728A86D8; + attr.exp_hash = true; + return __dn200_test_loopback(priv, &attr); +} + + +static int dn200_test_desc_sar(struct dn200_priv *priv) +{ + unsigned char src[ETH_ALEN] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + struct dn200_packet_attrs attr = { }; + int ret; + + if (!priv->dma_cap.vlins) + return -EOPNOTSUPP; + + attr.sarc = true; + attr.src = src; + attr.dst = priv->dev->dev_addr; + + priv->sarc_type = 0x2; + + ret = __dn200_test_loopback(priv, &attr); + + priv->sarc_type = 0x0; + return ret; +} + +static int dn200_test_reg_sar(struct dn200_priv *priv) +{ + unsigned char src[ETH_ALEN] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + struct dn200_packet_attrs attr = { }; + int ret; + + if (!priv->dma_cap.vlins) + return -EOPNOTSUPP; + + attr.sarc = true; + attr.src = src; + attr.dst = priv->dev->dev_addr; + + if (dn200_sarc_configure(priv, priv->ioaddr, 0x3)) + return -EOPNOTSUPP; + + ret = __dn200_test_loopback(priv, &attr); + + dn200_sarc_configure(priv, priv->ioaddr, 0x0); + return ret; +} + + +static int dn200_test_arp_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct dn200_test_priv *tpriv = pt->af_packet_priv; + struct ethhdr *ehdr; + struct arphdr *ahdr; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (!ether_addr_equal_unaligned(ehdr->h_dest, tpriv->packet->src)) + goto out; + + ahdr = arp_hdr(skb); + if (ahdr->ar_op != 
htons(ARPOP_REPLY)) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int dn200_test_arpoffload(struct dn200_priv *priv) +{ + unsigned char src[ETH_ALEN] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 }; + unsigned char dst[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct dn200_packet_attrs attr = { }; + struct dn200_test_priv *tpriv; + struct sk_buff *skb = NULL; + u32 ip_addr = 0xdeadcafe; + u32 ip_src = 0xdeadbeef; + int ret; + + if (!priv->dma_cap.arpoffsel) + return -EOPNOTSUPP; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_ARP); + tpriv->pt.func = dn200_test_arp_validate; + tpriv->pt.dev = priv->dev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = &attr; + dev_add_pack(&tpriv->pt); + + attr.src = src; + attr.ip_src = ip_src; + attr.dst = dst; + attr.ip_dst = ip_addr; + + skb = dn200_test_get_arp_skb(priv, &attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dn200_set_arp_offload(priv, priv->hw, true, ip_addr); + if (ret) + goto cleanup; + + ret = dev_set_promiscuity(priv->dev, 1); + if (ret) + goto cleanup; + usleep_range(10000, 20000); + ret = dev_direct_xmit(skb, 0); + if (ret) + goto cleanup_promisc; + + wait_for_completion_timeout(&tpriv->comp, DN200_LB_TIMEOUT); + ret = tpriv->ok ? 
0 : -ETIMEDOUT; + +cleanup_promisc: + dev_set_promiscuity(priv->dev, -1); +cleanup: + dn200_set_arp_offload(priv, priv->hw, false, 0x0); + dev_remove_pack(&tpriv->pt); + usleep_range(10000, 20000); + kfree(tpriv); + return ret; +} + +static int __dn200_test_jumbo(struct dn200_priv *priv, u16 queue) +{ + struct dn200_packet_attrs attr = { }; + int size = priv->dma_buf_sz; + + attr.dst = priv->dev->dev_addr; + attr.max_size = size; + attr.queue_mapping = queue; + + return __dn200_test_loopback(priv, &attr); +} + +static int dn200_test_jumbo(struct dn200_priv *priv) +{ + return __dn200_test_jumbo(priv, 0); +} + +static int dn200_test_mjumbo(struct dn200_priv *priv) +{ + u32 chan, tx_cnt = priv->plat->tx_queues_to_use; + int ret; + + for (chan = 0; chan < tx_cnt; chan++) { + ret = __dn200_test_jumbo(priv, chan); + if (ret) + return ret; + } + + return 0; +} + +#define DN200_LOOPBACK_NONE 0 +#define DN200_LOOPBACK_MAC 1 +#define DN200_LOOPBACK_PHY 2 + +#define PUREPF_XPCS BIT(0) +#define PUREPF_PHY BIT(1) +#define SRIOVPF_XPCS BIT(2) +#define SRIOVPF_PHY BIT(3) +#define SRIOVVF_XPCS BIT(4) +#define SRIOVVF_PHY BIT(5) + +static const struct dn200_test { + char name[ETH_GSTRING_LEN]; + int lb; + int (*fn)(struct dn200_priv *priv); + u16 type; +} dn200_selftests[] = { + { + .name = "MAC Loopback ", + .lb = DN200_LOOPBACK_MAC, + .fn = dn200_test_mac_loopback, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, + { + .name = "PHY Loopback ", + .lb = DN200_LOOPBACK_PHY, /* Test will handle it */ + .fn = dn200_test_phy_loopback, + .type = PUREPF_PHY | SRIOVPF_PHY, + }, + { + .name = "MMC Counters ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_mmc, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, + { + .name = "Hash Filter MC ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_hfilt, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, + { + .name = "Perfect Filter UC ", + .lb = DN200_LOOPBACK_PHY, + .fn = 
dn200_test_pfilt, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, + { + .name = "MC Filter ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_mcfilt, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, + { + .name = "UC Filter ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_ucfilt, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, { + .name = "RSS ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_rss, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_PHY | SRIOVPF_XPCS, + }, + { + .name = "SA Replacement (desc) ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_desc_sar, + .type = PUREPF_PHY | SRIOVPF_PHY, + }, { + .name = "SA Replacement (reg) ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_reg_sar, + .type = PUREPF_PHY | SRIOVPF_PHY, + }, + { + .name = "ARP Offload ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_arpoffload, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, { + .name = "Jumbo Frame ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_jumbo, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, { + .name = "Multichannel Jumbo ", + .lb = DN200_LOOPBACK_PHY, + .fn = dn200_test_mjumbo, + .type = PUREPF_XPCS | PUREPF_PHY | SRIOVPF_XPCS | SRIOVPF_PHY, + }, +}; + +static u16 dn200_seltftest_get_type(struct dn200_priv *priv) +{ + u16 type = 0; + + if (priv->dev->phydev) { + if (PRIV_IS_VF(priv)) + type = SRIOVVF_PHY; + else if (PRIV_SRIOV_SUPPORT(priv)) + type = SRIOVPF_PHY; + else + type = PUREPF_PHY; + } else { + if (PRIV_IS_VF(priv)) + type = SRIOVVF_XPCS; + else if (PRIV_SRIOV_SUPPORT(priv)) + type = SRIOVPF_XPCS; + else + type = PUREPF_XPCS; + } + return type; +} + +void dn200_selftest_run(struct net_device *dev, + struct ethtool_test *etest, u64 *buf) +{ + struct dn200_priv *priv = netdev_priv(dev); + int count = dn200_selftest_get_count(priv); + int i, ret, j = 0; + u16 type = dn200_seltftest_get_type(priv); + u16 val = 0; + + memset(buf, 0, sizeof(*buf) * 
count); + dn200_test_next_id = 0; + if (PRIV_IS_VF(priv)) { + netdev_err(priv->dev, "VF self tests are not supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + /*phyloopback needs on line */ + if (type & dn200_selftests[1].type) { + netdev_err(priv->dev, + "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + } else if (!netif_carrier_ok(dev)) { + netdev_err(priv->dev, "You need valid Link to execute tests\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + /* Wait for queues drain */ + msleep(200); + /*mac selftest first*/ + ret = 0; + ret = dn200_set_mac_loopback(priv, priv->ioaddr, true); + if (ret) { + netdev_err(priv->dev, "Loopback is not supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + for (i = 0; i < ARRAY_SIZE(dn200_selftests); i++) { + if (dn200_selftests[i].lb != DN200_LOOPBACK_MAC) + continue; + if (type & dn200_selftests[i].type) { + ret = dn200_selftests[i].fn(priv); + if (ret && (ret != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + buf[j] = ret; + j++; + } + } + ret = dn200_set_mac_loopback(priv, priv->ioaddr, false); + if (ret) { + netdev_err(priv->dev, "Loopback is not supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + /*phy selftest*/ + + if (dev->phydev) + priv->plat_ex->phy_info->phy_loopback_flag = true; + + if (dev->phydev) { + val = phy_read(dev->phydev, MII_BMCR); + ret = phy_write(dev->phydev, MII_BMCR, 0x4140); + msleep(2000); + if (ret) + ret = dn200_set_mac_loopback(priv, priv->ioaddr, true); + } else { + ret = dn200_set_mac_loopback(priv, priv->ioaddr, true); + } + if (ret) { + netdev_err(priv->dev, "Loopback is not supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + for (i = 0; i < ARRAY_SIZE(dn200_selftests); i++) { + ret = 0; + if (dn200_selftests[i].lb != DN200_LOOPBACK_PHY) + continue; + /* + * First tests will always be MAC / PHY loobpack. 
If any of + * them is not supported we abort earlier. + */ + if (type & dn200_selftests[i].type) { + ret = dn200_selftests[i].fn(priv); + if (ret && (ret != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + buf[j] = ret; + j++; + } + } + + if (dev->phydev) { + ret = phy_write(dev->phydev, MII_BMCR, val | BMCR_RESET); + if (ret) + ret = dn200_set_mac_loopback(priv, priv->ioaddr, false); + } else { + ret = dn200_set_mac_loopback(priv, priv->ioaddr, false); + } + + if (ret) { + netdev_err(priv->dev, "Loopback is not supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + if (dev->phydev) { + usleep_range(3000000, 4000000); + priv->plat_ex->phy_info->phy_loopback_flag = false; + } +} + +void dn200_selftest_get_strings(struct dn200_priv *priv, u8 *data) +{ + u8 *p = data; + int i; + int j = 0; + u16 type = dn200_seltftest_get_type(priv); + + for (i = 0; i < ARRAY_SIZE(dn200_selftests); i++) { + if (type & dn200_selftests[i].type) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", j + 1, + dn200_selftests[i].name); + p += ETH_GSTRING_LEN; + j++; + } + } +} + +int dn200_selftest_get_count(struct dn200_priv *priv) +{ + int i, j = 0; + u16 type = dn200_seltftest_get_type(priv); + + for (i = 0; i < ARRAY_SIZE(dn200_selftests); i++) { + if (type & dn200_selftests[i].type) + j++; + } + return j; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.c b/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.c new file mode 100644 index 000000000000..2abbc01b088c --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ * + * Author: Wang Ai-Yong + * + * Get special configurations for dn200 + */ + +#include "common.h" +#include "dn200_spec_acc.h" +#include "dn200_spec_def.h" +#include "dn200_sriov.h" +#include "dn200.h" + +static bool feat_has_priv(struct dn200_priv *priv, enum FEATURE_ID feat_id) +{ + int feat_priv = 0; + + WARN_ON(spec_table[feat_id].feat_id != feat_id); + + if (!PRIV_SRIOV_SUPPORT(priv)) + feat_priv = FEAT_PRIV_PURE_PF; + else if (PRIV_SRIOV_SUPPORT(priv)) + feat_priv = FEAT_PRIV_SRIOV_PF; + else if (PRIV_IS_VF(priv)) + feat_priv = FEAT_PRIV_VF; + else + dev_err(priv->device, "%s: ERROR: don't exist valid driver type.\n", + __func__); + + return ((spec_table[feat_id].feat_priv & feat_priv) != 0); +} + +static enum DRV_TYPE drv_type_get(struct dn200_priv *priv) +{ + if (!PRIV_SRIOV_SUPPORT(priv) && !PRIV_IS_VF(priv)) + return DRV_PURE_PF; + + if (PRIV_SRIOV_SUPPORT(priv)) + return DRV_SRIOV_PF; + + if (PRIV_IS_VF(priv)) + return DRV_VF; + + /* Can't reach here, should have valid driver type */ + WARN_ON(true); + return DRV_PURE_PF; +} + +/** + * dn200_feat_support - Get the feature supported status + * @feat_id: feature id + * @priv: driver private structure + */ +bool dn200_feat_support(struct dn200_priv *priv, enum FEATURE_ID feat_id) +{ + if (feat_has_priv(priv, feat_id)) + return true; + + return false; +} + +/** + * dn200_max_mtu_get - Get max mtu + * @priv: driver private structure + * @mtu: max mtu + */ +int dn200_max_mtu_get(struct dn200_priv *priv, int *max_mtu, int *min_mtu) +{ + enum FEATURE_ID feat_id = FEAT_MTU_JUMB; + enum DRV_TYPE drv_type = drv_type_get(priv); + int pf_id = priv->plat_ex->pf_id; + struct feat_mtu_spec *mtu_spec = spec_table[feat_id].feat_spec; + + if (feat_has_priv(priv, feat_id)) { + *max_mtu = mtu_spec[drv_type].max_mtu[pf_id]; + *min_mtu = mtu_spec[drv_type].min_mtu; + return 0; + } + + return -EACCES; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.h b/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.h 
new file mode 100644 index 000000000000..66082161d2e9 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_spec_acc.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + */ + +#ifndef __DN200_SEPC_ACC_H__ +#define __DN200_SEPC_ACC_H__ + +enum FEATURE_ID { + FEAT_MTU_JUMB = 0, + FEAT_RSS, + FEAT_XXX, + + FEAT_MAX_ID +}; + +bool dn200_feat_support(struct dn200_priv *priv, enum FEATURE_ID feat_id); +int dn200_max_mtu_get(struct dn200_priv *priv, int *max_mtu, int *min_mtu); + +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_spec_def.h b/drivers/net/ethernet/dapustor/dn200/dn200_spec_def.h new file mode 100644 index 000000000000..b59f01e842cd --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_spec_def.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. + * + * Author: Wang Ai-Yong + * + * Get special configurations for dn200 + */ + +#define __DN200_SEPC_DEF_H__ + +#include "common.h" + +/* feature privileges */ +#define FEAT_PRIV_PURE_PF 0x01 /* Feature can run on pure pf (Pure PF don't support SRIOV) */ +#define FEAT_PRIV_SRIOV_PF 0x20 /* Feature can run on SRIOV pf */ +#define FEAT_PRIV_VF 0x40 /* Feature can run on vf */ +#define FEAT_PRIV_NONE 0x0 /* Feature can't run on any condition */ + +/* MTU or Jumbo feature for every driver type */ +const struct feat_mtu_spec { + int max_mtu[XGE_NUM]; + int min_mtu; +} mtu_spec[DRV_TYPE_MAX] = { + [DRV_PURE_PF] = { + /*Hardware scheduling takes up space,a certain space must be reserved */ + .max_mtu[0] = 9600, + .max_mtu[1] = 9600, + .max_mtu[2] = 9600, /* 9600 */ + .max_mtu[3] = 9600, /* 9600 */ + .min_mtu = 68, + }, + [DRV_SRIOV_PF] = { + .max_mtu[0] = 9600, + .max_mtu[1] = 9600, + .max_mtu[2] = 9600, /* 9600 */ + .max_mtu[3] = 9600, /* 9600 */ + .min_mtu = 68, + }, + [DRV_VF] = { + /* 2KB, VF FIFO size is 3KB(refer DN200_01_VF_TX_FIFO_SIZE), + * 2K mtu TSO works fine + 
*/ + .max_mtu[0] = 1500, + .max_mtu[1] = 1500, /* 2KB */ + .max_mtu[2] = 9600, /* 9600 */ + .max_mtu[3] = 9600, /* 9600 */ + .min_mtu = 68, + }, +}; + +/* XXX feature */ + +const struct dn200_feat_spec { + int feat_id; + int feat_priv; + void *feat_spec; +} spec_table[FEAT_MAX_ID] = { + { + .feat_id = FEAT_MTU_JUMB, + .feat_priv = FEAT_PRIV_PURE_PF | FEAT_PRIV_SRIOV_PF | FEAT_PRIV_VF, + .feat_spec = (void *)mtu_spec, + }, + { + .feat_id = FEAT_RSS, + .feat_priv = FEAT_PRIV_PURE_PF | FEAT_PRIV_SRIOV_PF, + .feat_spec = NULL, + }, + { + .feat_id = FEAT_XXX, + .feat_priv = FEAT_PRIV_PURE_PF, + .feat_spec = NULL, + } +}; + diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_sriov.c b/drivers/net/ethernet/dapustor/dn200/dn200_sriov.c new file mode 100644 index 000000000000..939ad65b655a --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_sriov.c @@ -0,0 +1,907 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + * + * Author: Guo Feng + * + * Config dn200 sriov + */ +#include +#include +#include +#include +#include +#include "dn200.h" +#include "dn200_sriov.h" + +void dn200_sriov_reconfig_hw_feature(struct dn200_priv *priv, + struct dma_features *dma_cap) +{ + int tx_fifo_size = 0; + bool port_is_0_and_1 = false; + + if (!PRIV_IS_VF(priv)) { + if (priv->plat_ex->pf_id < 2) + port_is_0_and_1 = true; + else + port_is_0_and_1 = false; + } + + if (PRIV_SRIOV_SUPPORT(priv)) { + if (port_is_0_and_1) { + tx_fifo_size = DN200_SRIOV_01_TX_FIFO_SIZE; + priv->tx_fifo_queue_0 = DN200_SRIOV_Q0_TX_FIFO_SIZE; + dma_cap->tx_fifo_size = + DN200_01_MAX_FIFO_SIZE - tx_fifo_size * priv->plat_ex->max_vfs; + if (priv->plat_ex->max_speed == SPEED_10000) { + priv->mtl_queue_fifo_avg = DN200_RX_TC_FIFO_SIZE; + priv->mtl_queue_fifo_more = DN200_RX_SUPER_FIFO_SIZE; + } else { + priv->mtl_queue_fifo_avg = DN200_RX_TC_FIFO_SIZE_1G; + priv->mtl_queue_fifo_more = DN200_RX_SUPER_FIFO_SIZE_1G; + } + priv->vf_tx_fifo_size = tx_fifo_size; + } 
else { + priv->mtl_queue_fifo_avg = DN200_RX_FIFO_SIZE_PORT2_3; + priv->mtl_queue_fifo_more = DN200_RX_FIFO_SIZE_PORT2_3; + priv->tx_fifo_queue_0 = DN200_TX_FIFO_SIZE_PORT2_3; + dma_cap->tx_fifo_size = DN200_TX_FIFO_SIZE_PORT2_3; + dma_cap->rx_fifo_size = DN200_RX_FIFO_SIZE_PORT2_3; + priv->vf_tx_fifo_size = DN200_TX_FIFO_SIZE_PORT2_3; + } + } else if (PRIV_IS_PUREPF(priv)) { + if (port_is_0_and_1) { + tx_fifo_size = DN200_PURE_01_TX_FIFO_SIZE; + priv->tx_fifo_queue_0 = DN200_PURE_Q0_TX_FIFO_SIZE; + dma_cap->tx_fifo_size = + tx_fifo_size * (priv->plat_ex->default_tx_queue_num - 1) + priv->tx_fifo_queue_0; + if (priv->plat_ex->max_speed == SPEED_10000) { + /* no need give more fifo to last queue */ + priv->mtl_queue_fifo_avg = DN200_PURE_RX_TC_FIFO_SIZE; + priv->mtl_queue_fifo_more = DN200_PURE_RX_SUPER_FIFO_SIZE; + } else { + /* no need give more fifo to last queue */ + priv->mtl_queue_fifo_avg = DN200_PURE_RX_TC_FIFO_SIZE; + priv->mtl_queue_fifo_more = DN200_PURE_RX_SUPER_FIFO_SIZE; + } + } else { + priv->mtl_queue_fifo_avg = DN200_23_MAX_FIFO_SIZE; + priv->mtl_queue_fifo_more = 0; + priv->tx_fifo_queue_0 = DN200_TX_FIFO_SIZE_PORT2_3; + dma_cap->tx_fifo_size = DN200_23_MAX_FIFO_SIZE; + dma_cap->rx_fifo_size = DN200_23_MAX_FIFO_SIZE; + } + } + dma_cap->vlhash = 0; +} + +int dn200_sriov_vlan_entry_update(struct dn200_priv *priv) +{ + int ret = 0; + int i = 0; + u16 vid = 0; + + if (priv->plat_ex->vlan_num > (priv->hw->max_vlan_num + 1)) + return ret; + for_each_set_bit(vid, priv->active_vlans, VLAN_N_VID) { + __le16 vid_le = cpu_to_le16(vid); + + ret = + dn200_add_hw_vlan_rx_fltr(priv, NULL, priv->hw, 0, + vid_le, i, + i == (priv->plat_ex->vlan_num - 1)); + if (ret) + return ret; + i++; + } + return ret; +} + +int dn200_get_prev_used_bit(unsigned long *bitmap, u8 offset) +{ + int i = 0; + + if (offset > DN200_MC_ADDR_END) + return -1; + + if (offset) + i = offset - 1; + else + return -1; + + for (; i >= 0; i--) { + if (bitmap[0] & (1ULL << (i))) + return i; + } + 
return -1; +} + +int dn200_get_next_used_bit(unsigned long *bitmap, u8 offset, u8 last) +{ + int i = offset + 1; + + if (offset > DN200_MC_ADDR_END) + return -1; + if (last > DN200_MC_ADDR_END) + return -1; + + for (; i < last; i++) { + if (bitmap[0] & (1ULL << (i))) + return i; + } + + if (i == last) + i = -1; + return i; +} + +int dn200_get_unused_bit(unsigned long *bitmap, u8 last, u8 first) +{ + int i = first; + + if (last > DN200_MC_ADDR_END) + return -1; + for (; i <= last; i++) { + if (!(bitmap[0] & (1ULL << (i)))) + return i; + } + return -1; +} + +void dn200_get_func_uc_mac_addr(struct mac_device_info *hw, u8 offset, + struct mac_addr_route *mac_addr) +{ + int i = 0; + + for (; i < sizeof(struct mac_addr_route); i++) { + *((u8 *) mac_addr + i) = + readb(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + pf_uc_mac_addr[offset]) + i); + } +} + +void dn200_set_func_uc_mac_addr(struct mac_device_info *hw, u8 offset, + struct mac_addr_route *mac_addr) +{ + int i = 0; + + for (; i < sizeof(struct mac_addr_route); i++) { + writeb(*((u8 *) mac_addr + i), + LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + pf_uc_mac_addr[offset]) + i); + } +} + +void dn200_get_func_mac_addr(struct mac_device_info *hw, u8 offset, + struct mac_addr_route *mac_addr) +{ + int i = 0; + + for (; i < sizeof(struct mac_addr_route); i++) { + *((u8 *) mac_addr + i) = + readb(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + mac_addr[offset]) + i); + } +} + +void dn200_set_func_mac_addr(struct mac_device_info *hw, u8 offset, + struct mac_addr_route *mac_addr) +{ + int i = 0; + + for (; i < sizeof(struct mac_addr_route); i++) { + writeb(*((u8 *) mac_addr + i), + LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + mac_addr[offset]) + i); + } +} + +u16 dn200_get_vxlan_status(struct mac_device_info *hw) +{ + u16 data; + + data = + readw(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, vxlan_status)); + return data; +} + +void 
dn200_set_vxlan_status(struct mac_device_info *hw, u16 vxlan_status) +{ + writew(vxlan_status, + LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, + vxlan_status)); +} + +void dn200_reset_lram_rxp_async_info(struct mac_device_info *hw) +{ + int i = 0; + size_t info_size = sizeof(struct dn200_vf_rxp_async_info); + + for (; i < info_size; i++) + writeb(0, LRAM_VF_RXP_INFO_OFFSET(hw, DN200_VF_OFFSET_GET(hw)) + i); +} + +void dn200_set_lram_rxp_async_info(struct mac_device_info *hw, u8 *info) +{ + int i = 0; + size_t info_size = sizeof(struct dn200_vf_rxp_async_info); + + for (; i < info_size; i++) + writeb(*((u8 *)info + i), LRAM_VF_RXP_INFO_OFFSET(hw, DN200_VF_OFFSET_GET(hw)) + i); +} + +void dn200_get_lram_rxp_async_info(struct mac_device_info *hw, u8 *info, u8 vf_num) +{ + int i = 0; + size_t info_size = sizeof(struct dn200_vf_rxp_async_info); + + for (; i < info_size; i++) + *(info + i) = readb(LRAM_VF_RXP_INFO_OFFSET(hw, (vf_num)) + i); +} + +void dn200_get_lram_rxp_async_crc32(struct mac_device_info *hw, u8 vf_num, u8 *info) +{ + int i = 0; + size_t info_size = sizeof(u32); + + for (; i < info_size; i++) + *(info + i) = readb(LRAM_VF_RXP_INFO_OFFSET(hw, (vf_num)) + i); +} + +void dn200_set_lram_rxp_wb_info(struct mac_device_info *hw, u8 *info) +{ + int i = 0; + size_t info_size = sizeof(struct dn200_vf_rxp_async_wb); + + for (; i < info_size; i++) + writeb(*((u8 *)info + i), LRAM_VF_RXP_WB_INFO_OFFSET(hw, DN200_VF_OFFSET_GET(hw)) + i); +} + +void dn200_get_lram_rxp_wb_info(struct mac_device_info *hw, u8 *info, u8 vf_num) +{ + int i = 0; + size_t info_size = sizeof(struct dn200_vf_rxp_async_wb); + + for (; i < info_size; i++) + *(info + i) = readb(LRAM_VF_RXP_WB_INFO_OFFSET(hw, (vf_num)) + i); +} +/** + * dn200_hw_lock - set gloabl spin lock in firmware + * @hw: hw mac device information + * @is_locked: used as input and output, if it is true just return; if false get the lock + * Description: this function is used by lram & share table(e.g. 
rxp), + * it will set global spin lock in firmare, + * if suceess, output var is_locked will be true, and return 0 + * if failure, output var is_locked will be false, and return err code + */ +int dn200_hw_lock(struct mac_device_info *hw, bool *is_locked) +{ + int ret = 0; + u32 retry = 0; + u32 ret_val = 0; + struct dn200_priv *priv = hw->priv; + + /* when vf flow is stoped, don't allow to set rxp */ + if (HW_IS_VF(hw) && test_bit(DN200_VF_IN_STOP, &priv->state)) + return -1; + +#define MAX_SCHED_TIMES 10 +#define MAX_RETRY_TIMES 10000 + if (*is_locked) + return 0; + + while (retry++ < MAX_RETRY_TIMES) { + ret = lram_and_rxp_lock_and_unlock(&priv->plat_ex->ctrl, + BIT(0), &ret_val); + if (ret) { + netdev_err(priv->dev, + "get lock fail, for cq not return\n"); + return ret; + } + if (ret_val && ret_val == 1) { + udelay(5); + continue; + } else { + break; + } + } + + if (retry > MAX_RETRY_TIMES) { + priv->swc.hw_lock_timeout++; + ret = -1; + } + + if (!ret) { + *is_locked = true; + priv->swc.hw_lock_recfgs = 0; + } else { + *is_locked = false; + priv->swc.hw_lock_fail_cnt++; + /* just reconfig limit times, stop schedule the work when always lock failure */ + if (++priv->swc.hw_lock_recfgs < MAX_SCHED_TIMES) + queue_work(priv->wq, &priv->reconfig_task); + } + return ret; +} + +/** + * dn200_hw_unlock - free the gloabl spin lock in firmware + * @hw: hw mac device information + * @is_locked: used as input and output, if it is true need to unlock; if false do nothing + * Description: this function is used by lram & share table(e.g. 
rxp), + * it will free global spin lock in firmare, + * if suceess, output var is_locked will be false + * if failure, output var is_locked will be unchanged + */ +void dn200_hw_unlock(struct mac_device_info *hw, bool *is_locked) +{ + u32 ret_val = 0; + int ret = 0; + + if (*is_locked) { + ret = lram_and_rxp_lock_and_unlock(&hw->priv->plat_ex->ctrl, 0, &ret_val); + if (ret_val) + netdev_err(hw->priv->dev, "lram and rxp unlock failed\n"); + + if (ret) + netdev_err(hw->priv->dev, + "unlock failed, for cq not return func =%s, line = %d\n", + __func__, __LINE__); + + if ((ret == 0) && (ret_val == 0)) + *is_locked = false; + } +} + +/* idx : l3l4_entry idx + */ +int dn200_get_l3l4_filter_offset(struct mac_device_info *hw, int idx) +{ + int i = 0; + union l3l4_info_t l3l4_info; + u32 funcid = hw->priv->plat_ex->vf_offset + HW_IS_VF(hw) + 1; + + for (; i < 32; i++) { + l3l4_info.l3l4_info = + readl(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + l3l4_info) + i * sizeof(u32)); + if (l3l4_info.funcid == funcid && l3l4_info.entry_idx == idx) + return i; + } + return -1; +} + +int dn200_set_l3l4_filter_info(struct mac_device_info *hw, int idx, int offset, + bool clear) +{ + union l3l4_info_t l3l4_info; + u32 funcid = hw->priv->plat_ex->vf_offset + HW_IS_VF(hw) + 1; + + if (clear) { + l3l4_info.l3l4_info = 0; + } else { + l3l4_info.funcid = funcid; + l3l4_info.entry_idx = idx; + l3l4_info.offset = offset; + } + writel(l3l4_info.l3l4_info, + LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, + l3l4_info) + + offset * sizeof(u32)); + return 0; +} + +u8 dn200_vf_get_link_status(struct mac_device_info *hw) +{ + return readb(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, link_status)); +} + +void dn200_pf_set_link_status(struct mac_device_info *hw, u8 link_status) +{ + writeb(link_status, + LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, + link_status)); +} + +static void dn200_clear_vf_queue_info(struct dn200_priv *priv) +{ + 
u8 queue_info_size = sizeof(struct dn200_vf_info) / sizeof(u32); + int i, k; + + for (i = 0; i < DN200_MAX_VF_NUM; i++) { + for (k = 0; k < queue_info_size; k++) { + writel(0, + LRAM_QUEUE_PF_OFFSET(priv->hw) + + offsetof(struct dn200_sriov_queue_info, + vf_info[i]) + k * sizeof(u32)); + } + } +} + +static void dn200_set_vf_queue_info(struct dn200_priv *priv) +{ + u8 queue_info_size = sizeof(struct dn200_vf_info) / sizeof(u32); + int i, k; + u32 value; + + for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) { + for (k = 0; k < queue_info_size; k++) { + value = *((u32 *) (&priv->plat_ex->pf.vfs[i]) + k); + writel(value, + LRAM_QUEUE_PF_OFFSET(priv->hw) + + offsetof(struct dn200_sriov_queue_info, + vf_info[i]) + k * sizeof(u32)); + } + } +} + +void dn200_get_vf_queue_info(void __iomem *mailbox, struct dn200_vf_info *info, + u8 pf_id, u8 funcid) +{ + u8 queue_info_size = sizeof(struct dn200_vf_info) / sizeof(u32); + int k; + u32 *value; + + for (k = 0; k < queue_info_size; k++) { + value = (u32 *) (info) + k; + + *value = + readl(mailbox + DN200_SRIOV_QUEUE_OFFSET + + pf_id * SRIOV_MAX_SIZE_PER_PF + + offsetof(struct dn200_sriov_queue_info, + vf_info[funcid]) + k * sizeof(u32)); + } +} + +void dn200_set_phy_info(struct mac_device_info *hw, + struct dn200_sriov_phy_info *info) +{ + u8 phy_info_size = sizeof(struct dn200_sriov_phy_info) / sizeof(u32); + int k; + u32 value; + + for (k = 0; k < phy_info_size; k++) { + value = *((u32 *) (info) + k); + writel(value, + LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + sriov_phy_info) + k * sizeof(u32)); + } +} + +void dn200_get_phy_info(struct mac_device_info *hw, + struct dn200_sriov_phy_info *info) +{ + u8 phy_info_size = sizeof(struct dn200_sriov_phy_info) / sizeof(u32); + int k; + u32 *value; + + for (k = 0; k < phy_info_size; k++) { + value = (u32 *) (info) + k; + *value = + readl(LRAM_MAC_PF_OFFSET(hw) + + offsetof(struct dn200_mailbox_info, + sriov_phy_info) + k * sizeof(u32)); + } +} + +void 
dn200_sriov_mail_init(struct dn200_priv *priv) +{ + int k; + int info_size = sizeof(struct dn200_mailbox_info) / sizeof(u32); + + if (!PRIV_IS_VF(priv)) { + for (k = 0; k < info_size; k++) + writel(0, LRAM_MAC_PF_OFFSET(priv->hw) + k * sizeof(u32)); + } +} + +void dn200_sriov_ver_get(struct dn200_priv *priv, struct dn200_ver *ver_info) +{ + int k; + int info_size = sizeof(struct dn200_ver) / sizeof(u32); + u32 *value; + + for (k = 0; k < info_size; k++) { + value = ((u32 *) (ver_info) + k); + *value = readl(LRAM_VER_PF_OFFSET(priv->hw) + + offsetof(struct dn200_sriov_ver_info, + dn200_ver_info) + k * sizeof(u32)); + } +} + +void dn200_sriov_ver_set(struct dn200_priv *priv, struct dn200_ver *ver_info) +{ + int k; + int info_size = sizeof(struct dn200_ver) / sizeof(u32); + u32 value; + + for (k = 0; k < info_size; k++) { + value = *((u32 *) (ver_info) + k); + writel(value, LRAM_VER_PF_OFFSET(priv->hw) + + offsetof(struct dn200_sriov_ver_info, + dn200_ver_info) + k * sizeof(u32)); + } +} + +void dn200_sriov_static_init(struct dn200_priv *priv) +{ + int k; + int info_size = sizeof(struct dn200_sriov_mbx_info) / sizeof(u32); + + if (!PRIV_IS_VF(priv)) { + for (k = 0; k < info_size; k++) + writel(0, LRAM_MBX_PF_OFFSET(priv->hw) + k * sizeof(u32)); + + DN200_SET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, 0); + DN200_SET_LRAM_UPGRADE_PF_FINISH(priv->hw, 0, priv->plat_ex->pf_id); + } +} + +static int dn200_assign_vf_resources(struct dn200_priv *priv, int num_vfs) +{ + u8 iatu_per_vf; + int free_iatu; + u8 pf_id = priv->plat_ex->pf_id; + int num_txq, num_rxq; + int i = 0; + struct plat_dn200_data *plat_ex = priv->plat_ex; + struct dn200_vf_info *vfs; + u8 qps_per_vf; + u8 tx_queue_num = + plat_ex->tx_queues_total - plat_ex->tx_queues_reserved; + u8 rx_queue_num = + plat_ex->rx_queues_total - plat_ex->rx_queues_reserved; + u8 queue_num = min(tx_queue_num, rx_queue_num); + + vfs = kcalloc(num_vfs, sizeof(struct dn200_vf_info), GFP_KERNEL); + if (!vfs) + return -ENOMEM; + + 
priv->plat_ex->pf.vfs = vfs; + num_txq = plat_ex->tx_queues_total - queue_num; + num_rxq = plat_ex->rx_queues_total - queue_num; + qps_per_vf = min_t(int, num_txq / num_vfs, num_rxq / num_vfs); + qps_per_vf = min_t(int, DN200_MAX_QPS_PER_VF, qps_per_vf); + priv->plat_ex->pf.registered_vfs = num_vfs; + /*vlan 0 reserved, can not be use */ + iatu_per_vf = priv->plat_ex->vf_total_iatu[pf_id] / num_vfs; + free_iatu = priv->plat_ex->vf_total_iatu[pf_id] % num_vfs; + for (i = 0; i < num_vfs; i++) { + priv->plat_ex->pf.vfs[i].rx_queue_start = + queue_num + i * qps_per_vf; + priv->plat_ex->pf.vfs[i].tx_queue_start = + queue_num + i * qps_per_vf; + priv->plat_ex->pf.vfs[i].rx_queues_num = qps_per_vf; + priv->plat_ex->pf.vfs[i].tx_queues_num = qps_per_vf; + priv->plat_ex->pf.vfs[i].max_vfs = plat_ex->max_vfs; + priv->plat_ex->pf.vfs[i].registered_vfs = num_vfs; + priv->plat_ex->pf.vfs[i].iatu_num = iatu_per_vf; + priv->plat_ex->pf.vfs[i].max_vlan_num = 0; + if (free_iatu-- > 0) + priv->plat_ex->pf.vfs[i].iatu_num++; + } + + dn200_set_vf_queue_info(priv); + dn200_sriov_reconfig_hw_feature(priv, &priv->dma_cap); + return 0; +} + +static int dn200_free_vf_resources(struct dn200_priv *priv) +{ + dn200_clear_vf_queue_info(priv); + kfree(priv->plat_ex->pf.vfs); + priv->plat_ex->pf.vfs = NULL; + priv->plat_ex->pf.registered_vfs = 0; + dn200_sriov_reconfig_hw_feature(priv, &priv->dma_cap); + return 0; +} + +int dn200_datapath_close(struct dn200_priv *priv) +{ + int ret = 0; + u8 flow_state; + + /* if netdev is not running, no need to close dataptath */ + if (!netif_running(priv->dev)) + return 0; + + /* stop mac rx */ + dn200_mac_rx_set(priv, priv->ioaddr, false); + /* clean all tx & rx queues */ + ret = dn200_clean_all_tx_queues(priv, priv->plat->tx_queues_to_use); + if (PRIV_IS_VF(priv)) { + /* pf to vf, rx clean is no need*/ + DN200_ITR_SYNC_GET(priv->hw, vf_flow_state_event, + DN200_VF_OFFSET_GET(priv->hw), &flow_state); + + if (flow_state == FLOW_CLOSE_START) + return 0; + if 
(test_bit(DN200_VF_FLOW_CLOSE, &priv->state)) + return 0; + } + if (ret == 0) + ret = dn200_clean_all_rx_queues(priv); + /* clear tx queues or mtl fifo */ + return ret; +} + +void dn200_datapath_open(struct dn200_priv *priv) +{ + /* if netdev is not running, no need to open dataptath */ + if (!netif_running(priv->dev)) + return; + + if (priv->plat_ex->phy_info->link_status) + netif_carrier_on(priv->dev); + netif_tx_start_all_queues(priv->dev); + + /* start mac rx */ + dn200_mac_rx_set(priv, priv->ioaddr, true); +} + +int dn200_sriov_enable(struct dn200_priv *priv, int num_vfs) +{ + int rc = 0; + bool dev_running = netif_running(priv->dev); + + if (num_vfs > priv->plat_ex->pf.max_vfs) { + netdev_err(priv->dev, + "Can't enable %d VFs, max VFs supported is %d\n", + num_vfs, priv->plat_ex->pf.max_vfs); + return -EOPNOTSUPP; + } + + netdev_info(priv->dev, "Enabling %d VFs\n", num_vfs); + /* close tx & rx datapath before assign resources for vfs that will cause stop flow */ + if (dev_running) + dn200_datapath_close(priv); + + rc = dn200_assign_vf_resources(priv, num_vfs); + if (rc) + goto dev_open; + + rc = pci_enable_sriov(to_pci_dev(priv->device), num_vfs); + if (rc) + goto err_out1; + priv->plat_ex->pf.active_vfs = num_vfs; + priv->plat_ex->sriov_cfg = true; + + goto dev_open; + +err_out1: + priv->plat_ex->pf.registered_vfs = 0; + dn200_free_vf_resources(priv); +dev_open: + if (dev_running) + dn200_datapath_open(priv); + + return rc; +} + +void dn200_vf_flow_close(struct dn200_priv *priv) +{ + if (!priv->plat_ex->sriov_cfg) + return; + dn200_all_vf_flow_state_clear(priv); + dn200_vf_flow_state_set(priv, FLOW_CLOSE_START); + dn200_all_vf_flow_state_wait(priv, FLOW_CLOSE_DONE, false); +} + +void _dn200_vf_flow_open(struct dn200_priv *priv) +{ + set_bit(DN200_VF_FLOW_OPEN, &priv->state); + queue_work(priv->wq, &priv->service_task); +} + +void dn200_vf_flow_open(struct dn200_priv *priv) +{ + if (!priv->plat_ex->sriov_cfg) + return; + dn200_all_vf_flow_state_clear(priv); + 
dn200_vf_flow_state_set(priv, FLOW_OPEN_START); + dn200_all_vf_flow_state_wait(priv, FLOW_OPEN_DONE, true); + clear_bit(DN200_VF_FLOW_OPEN, &priv->state); +} + +int dn200_sriov_disable(struct dn200_priv *priv) +{ + u16 num_vfs = pci_num_vf(to_pci_dev(priv->device)); + + if (!num_vfs) + return 0; + if (pci_vfs_assigned(to_pci_dev(priv->device))) { + netdev_warn(priv->dev, + "Unable to free %d VFs because some are assigned to VMs.\n", + num_vfs); + return -EPERM; + } + /* wait all vfs to stop dev before disable sriov */ + pci_disable_sriov(to_pci_dev(priv->device)); + + /* close datapath before free resources that will cause stop flow */ + dn200_datapath_close(priv); + dn200_free_vf_resources(priv); + /* open datapath */ + dn200_datapath_open(priv); + priv->plat_ex->sriov_cfg = false; + return 0; +} + +int dn200_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct net_device *dev = pci_get_drvdata(pdev); + struct dn200_priv *priv = netdev_priv(dev); + int ret = 0; + u32 flag = 0; + + if (test_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state) || + test_bit(DN200_PCIE_UNAVAILD, &priv->state)) + return -EIO; + if (!dn200_hwif_id_check(priv->ioaddr)) { + netdev_err(priv->dev, "%s: %s\n", __func__, DN200_PCIE_BAR_ERR); + set_bit(ADMIN_QUEUE_CQ_NO_RETURN, &priv->plat_ex->ctrl.admin_state); + set_bit(DN200_PCIE_UNAVAILD, &priv->state); + dn200_global_err(priv, DN200_PCIE_UNAVAILD_ERR); + return -EIO; + } + + if (!PRIV_SRIOV_SUPPORT(priv)) { + netdev_err(dev, + "Reject SRIOV config request since device is not supported!\n"); + return -EOPNOTSUPP; + } + + DN200_GET_LRAM_UPGRADE_MEMBER(priv->hw, upgrade_flag, &flag); + if (flag) { + netdev_err(dev, + "Reject echo vfs after update fw!\n"); + return -EOPNOTSUPP; + } + + priv->plat_ex->vf_flag = true; + priv->plat_ex->pf.max_vfs = priv->plat_ex->max_vfs; + if (pci_vfs_assigned(pdev)) { + netdev_warn(dev, + "Unable to configure SRIOV since some VFs are assigned to VMs.\n"); + priv->plat_ex->vf_flag = false; 
+ return -EPERM; + } + + /* if there are previous existing VFs, clean them up */ + dn200_sriov_disable(priv); + if (!num_vfs) + goto sriov_cfg_exit; + + ret = dn200_sriov_enable(priv, num_vfs); + if (ret) { + netdev_err(dev, "Sriov enable failed, error %d\n", ret); + priv->plat_ex->vf_flag = false; + return ret; + } + +sriov_cfg_exit: + priv->plat_ex->vf_flag = false; + return num_vfs; +} + +int dn200_vf_glb_err_rst_notify(struct dn200_priv *priv) +{ + if (!PRIV_IS_VF(priv)) + return -EINVAL; + + /* for sriov vf noitfy pf */ + DN200_ITR_SYNC_SET(priv->hw, reset_event, VF2PF_ERR_RST_NOTIFY, 1); + priv->xstats.rst_start_count++; + dev_dbg(priv->device, "%s, %d, vf to pf err rst request.\n", + __func__, __LINE__); + + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, HW_RESET_ID, 1); + irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); + return 0; +} + +int dn200_pf_glb_err_rst_process(struct dn200_priv *priv) +{ + int ret = 0; + u8 reset_event = 0; + + if (PRIV_IS_VF(priv)) + return 0; + + /* pf process the err reset request from vf */ + if (PRIV_SRIOV_SUPPORT(priv)) { + DN200_ITR_SYNC_GET(priv->hw, reset_event, VF2PF_ERR_RST_NOTIFY, + &reset_event); + if (reset_event) { + priv->xstats.rst_start_accept_count++; + dev_dbg(priv->device, + "%s, %d, pf received vf err rst request.\n", + __func__, __LINE__); + dn200_global_err(priv, DN200_VF_TO_PF); + /* received vf to pf err reset request, clear it */ + DN200_ITR_SYNC_SET(priv->hw, reset_event, + VF2PF_ERR_RST_NOTIFY, 0); + } + } + + return ret; +} + +void dn200_vf_flow_state_set(struct dn200_priv *priv, u8 flow_sate) +{ + int i = 0; + + for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) + DN200_ITR_SYNC_SET(priv->hw, vf_flow_state_event, i, flow_sate); + + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, FLOW_STATE_ID, 1); + + irq_peer_notify(priv->plat_ex->pdev, &priv->plat_ex->ctrl); +} + +bool dn200_all_vf_flow_state_wait(struct dn200_priv *priv, u8 wait_state, bool in_task) +{ + int i, retries = 20; /* max retry 
10times, 1s */ + u8 vf_flow_state = 0; + u8 reg_info; + bool all_fun_state; + + while (retries-- > 0) { + all_fun_state = true; + for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) { + DN200_HEARTBEAT_GET(priv->hw, registered_vf_state, i, + ®_info); + if (!((reg_info & DN200_VF_REG_STATE_OPENED) + || (reg_info & DN200_VF_REG_STATE_IN_RST))) { + /*vf not register now, skip it */ + continue; + } + DN200_ITR_SYNC_GET(priv->hw, vf_flow_state_event, i, + &vf_flow_state); + dev_dbg(priv->device, + "%s, %d, retry %d, vf:%d, vf_flow_state:%d, wait_state:%d, reg_info:%d", + __func__, __LINE__, 20 - retries, i, + vf_flow_state, wait_state, reg_info); + if (vf_flow_state != wait_state) { + all_fun_state = false; + break; + } + } + + if (all_fun_state) { + dev_dbg(priv->device, + "%s, %d, retry %d to wait all vfs flow state:%d", + __func__, __LINE__, 20 - retries, + vf_flow_state); + return true; + } + + /* Sleep then retry */ + if (in_task) + usleep_range(100000, 110000); + else + msleep(100); + } + dev_err(priv->device, + "%s, %d, retry timeout to wait all vfs flow state", __func__, + __LINE__); + return false; +} + +void dn200_all_vf_flow_state_clear(struct dn200_priv *priv) +{ + int i; + + for (i = 0; i < priv->plat_ex->pf.registered_vfs; i++) + DN200_ITR_SYNC_SET(priv->hw, vf_flow_state_event, i, 0); + + DN200_ITR_SYNC_SET(priv->hw, itr_sync_app, FLOW_STATE_ID, 0); +} diff --git a/drivers/net/ethernet/dapustor/dn200/dn200_sriov.h b/drivers/net/ethernet/dapustor/dn200/dn200_sriov.h new file mode 100644 index 000000000000..0aa3dce3e43d --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dn200_sriov.h @@ -0,0 +1,571 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ * + * Author: Guo Feng + * + * Config dn200 sriov + */ + +#ifndef _DN200_SRIOV_H +#define _DN200_SRIOV_H + +#include +#include +#include +#include "common.h" +#include "dn200_ctrl.h" +#define DN200_MAX_VF_NUM 8 +#define DN200_MAX_QPS_PER_VF 1 +#define DN200_MAX_VF_LOSS_HB_CNT 120 +#define LRAM_UPGRADE_OFFSET 0x10000 +#define HW_VF_NUM(hw) ((hw)->priv->plat_ex->pf.registered_vfs) + +/****************** + * rxp_entry used_for comment + * 0-1 BC broadcast_macaddr + * 2-34 PF_UC PF owns 11 uc mac_addr + * ENTRY 0: UC first + * ENTRY 1: UC SECOND + * ENTRY 2: VLAN ENTRY + * + * 35-41 PF_vlan PF has Five fixed vlan_ids + * 42-57 VF_UC VF has only one uc mac_addr + * 58-123 MC PF VF own MC + * 124 ALL Multicast + * 125 PROMISC + * 126 all drop all drop entry(for vlan filter) + * 127 all bypass all bypass entry(for rss) + * + */ +#define DN200_PF_SELF_UC_NUM (1) +#define DN200_PF_OTHER_UC_NUM (15) +#define DN200_PF_VLAN_ENTRY_NUM (7) +#define DN200_VF_UC_OFF (1 + DN200_PF_SELF_UC_NUM + DN200_PF_OTHER_UC_NUM) +#define DN200_VF_UC_NUM (8) +#define DN200_PFVF_MC_NUM (33) +#define DN200_BC_RXP_OFF (1 * 2 + 1 + DN200_PF_VLAN_ENTRY_NUM + (DN200_PF_SELF_UC_NUM + DN200_PF_OTHER_UC_NUM + DN200_VF_UC_NUM + DN200_PFVF_MC_NUM) * 2 + 1 + 1 + 1) +#define DN200_ALL_DROP_OFF (DN200_BC_RXP_OFF - 1) +#define DN200_MAX_USED_RXP_NUM (DN200_BC_RXP_OFF + 1) +#define DN200_ALL_PROMISC_OFF (DN200_ALL_DROP_OFF - 1) +#define DN200_ALL_MULTCAST_OFF (DN200_ALL_PROMISC_OFF - 1) +#define DN200_MAX_UC_MAC_ADDR_NUM 15 +#define DN200_PF_UC_ADDR_START 2 +#define DN200_PF_UC_ADDR_END 16 +#define DN200_MC_ADDR_START (1 + DN200_PF_SELF_UC_NUM + DN200_PF_OTHER_UC_NUM + 8) +#define DN200_MC_ADDR_END (33 + 1 + DN200_PF_SELF_UC_NUM + DN200_PF_OTHER_UC_NUM + 8 - 1) + +#define DN200_VLAN_ADDR_START (1 * 2 + 2 * 16 + 1) + +#define DMA_CHA_NO_OFFSET(id) ((id) * (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + offsetof(struct RXP_FPR_ENTRY, dma_ch_no) / sizeof(u32)) +#define OK_INDEX_ENTRY_OFFSET(id) ((id) * 
(sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + offsetof(struct RXP_FPR_ENTRY, ok_index) / sizeof(u32)) +#define AFRFNC_ENTRY_OFFSET(id) ((id) * (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + 2) +#define OK_INDEX_MASK GENMASK(23, 16) +#define OK_INDEX_OFFSET 16 +#define AF_ENABLE BIT(0) +#define RF_ENABLE BIT(1) +#define IM_ENABLE BIT(2) +#define NC_ENABLE BIT(3) + +#define DA_DMA_CHA_NO_OFFSET_AMPM(id) DMA_CHA_NO_OFFSET((id)) +#define DA_DMA_CHA_NO_OFFSET(id) DMA_CHA_NO_OFFSET(((id) + 1)) +#define DA_OK_INDEX_FIRST_ENTRY_OFFSET(id) OK_INDEX_ENTRY_OFFSET((id)) +#define DA_OK_INDEX_SEC_ENTRY_OFFSET(id) OK_INDEX_ENTRY_OFFSET(((id) + 1)) +#define DN200_MAX_TC_ENTRY_NUM 16 + +#define DN200_VLAN_NUM 6 + +#define DN200_FIFO_SIZE_CHANGE_SIGN 0x14 + +/*SRIOV PF q0 tx fifo 11.5K, other q 3.5K*/ +#define DN200_SRIOV_Q0_TX_FIFO_SIZE 11776 +/*PUREPF PF q0 tx fifo 16K other 4K*/ +#define DN200_PURE_Q0_TX_FIFO_SIZE 15360 +#define DN200_OTHER_Q_TX_FIFO_SIZE 5120 +#define DN200_SRIOV_01_TX_FIFO_SIZE 3584 +#define DN200_PURE_01_TX_FIFO_SIZE 7168 +#define DN200_01_VF_RX_FIFO_SIZE 16384 +#define DN200_RX_TC_FIFO_SIZE 11264 /* 11KB */ +#define DN200_RX_SUPER_FIFO_SIZE 20480 /* 20KB */ +#define DN200_RX_TC_FIFO_SIZE_1G 12800 /* 12.5KB */ +#define DN200_RX_SUPER_FIFO_SIZE_1G 14366 /* 14KB */ +#define DN200_PURE_RX_TC_FIFO_SIZE 16384 /* 16KB */ +#define DN200_PURE_RX_SUPER_FIFO_SIZE 0 /* 0 */ +#define DN200_RX_UNTAG_PKT_FIFO_INDEX 0 +#define DN200_TX_FIFO_SIZE_PORT2_3 16384 +#define DN200_RX_FIFO_SIZE_PORT2_3 16384 + +#define DN200_23_VF_FIFO_SIZE 16384 +#define DN200_01_MAX_FIFO_SIZE 65536 +#define DN200_23_MAX_FIFO_SIZE 32768 + +#define PRIV_SRIOV_SUPPORT(priv) ((priv)->plat_ex->sriov_supported) +#define PRIV_NVME_SUPPORT(priv) ((priv)->plat_ex->nvme_supported) +#define DN200_VF_OFFSET_GET(hw) ((hw)->priv->plat_ex->vf_offset) +struct mac_addr_route { + u8 mac_addr[ETH_ALEN]; + u16 channel; + int rxp_offset; +} __aligned(8); + +struct dn200_vf_info { + u8 rx_queue_start; + u8 
tx_queue_start; + u8 rx_queues_num; + u8 tx_queues_num; + u8 max_vfs; + u8 registered_vfs; + u16 max_vlan_num; + u8 iatu_num; +} __aligned(8); + +struct dn200_pf_info { + u16 active_vfs; + u16 registered_vfs; + u16 max_vfs; + u16 vlan_num_per_vf; + void __iomem *ioaddr; + void __iomem *ctrl_addr; + unsigned long *vf_event_bmap; + struct dn200_vf_info *vfs; +}; + +struct dn200_sriov_phy_info { + u8 media_type; + u8 an; + u8 pause; + u8 phy_interface; + /*link status*/ + int speed; + u8 dup; + u16 link_modes; +} __aligned(8); + +union l3l4_info_t { + struct { + u32 funcid:4; + u32 entry_idx:5; + u32 offset:5; + }; + u32 l3l4_info; +}; + +struct RXP_FPR_ENTRY { + u32 match_data; + u32 match_en; + u8 af:1; + u8 rf:1; + u8 im:1; + u8 nc:1; + u8 res1:4; + u8 frame_offset:6; + u8 res2:2; + u8 ok_index; + u8 giv:1; + u8 gid:3; + u8 res3:4; + u16 dma_ch_no; + u16 res4; +} __packed; + +/* apps use hw interrupt to sync with peer(pf or vf)*/ +enum ITR_SYNC_APP_ID { + HW_RESET_ID, + FLOW_STATE_ID, + RXP_TASK, + + /* MAX_APP_ID is last one, put others before it */ + MAX_APP_ID +}; + +enum HW_RESET_EVENT { + VF2PF_ERR_RST_NOTIFY = 0, + /* MAX_RESET_EVENT is last one, put others before it */ + MAX_RESET_EVENT +}; + +enum dn200_lram_lock { + DN200_LRAM_LOCK, + DN200_UC_LRAM_LOCK, + DN200_MC_LRAM_LOCK, + DN200_L3L4_LRAM_LOCK, + DN200_BC_LRAM_LOCK, +}; + +enum VF_FLOW_STATE_EVENT { + FLOW_OPEN_START = 1, + FLOW_OPEN_DONE, + FLOW_CLOSE_START, + FLOW_CLOSE_DONE, +}; + +struct dn200_itr_sync { + /* all app (e.g. hw reset) which use hw interrupt to sync between pf & vf + * one byte represet one event, 1: app exist 0: not exist + */ + u8 itr_sync_app[MAX_APP_ID]; + /* e.g. 
vf notify pf to do reset; or pf notify vf to do reset, + * one byte represet one event, 1: event exist 0: not exist + */ + u8 reset_event[MAX_RESET_EVENT]; + /* record vf function that have done reset + * one byte represet one func, 1: fun done reset 0: not done + */ + u8 reset_func_list[DN200_MAX_VF_NUM + 1]; + u8 vf_flow_state_event[DN200_MAX_VF_NUM]; + u8 vf_reset_mac_list[DN200_MAX_VF_NUM]; + u8 vf_link_list[DN200_MAX_VF_NUM]; + u8 vf_carrier[DN200_MAX_VF_NUM]; + u8 vf_probe[DN200_MAX_VF_NUM]; + u8 pf_carrier; +} __aligned(8); + +enum DN200_REG_VF_STATE { + DN200_VF_REG_STATE_NONE = 0, + DN200_VF_REG_STATE_OPENED = BIT(0), + DN200_VF_REG_STATE_IN_RST = BIT(1), +}; + +struct dn200_heartbeat_info { + /*vf should do : heatbeat = heatbeat ^ last_heatbeat*/ + u8 heartbeat[DN200_MAX_VF_NUM + 1]; /*registered_vf_state set headbeat */ + /*Pf should check last_heatbeat != heatbeat && set last_heatbeat = heatbeat */ + u8 last_heartbeat[DN200_MAX_VF_NUM + 1]; + /* all probed vf should set registered_vf_state to true, + * PF will clear this when vf's rxp reset + * VF should check this to judge whether itself has been reset by PF + */ + u8 registered_vf_state[DN200_MAX_VF_NUM + 1]; +}; + +/* =================================== + * SRIOV LRAM MAILBOX STRUCT INFO + */ + +#define SRIOV_MAX_SIZE_PER_PF 4096 + +#define DN200_SRIOV_MAC_OFFSET 0x0 +//store mac addr info to mailbox bar mem +struct dn200_mailbox_info { + + unsigned long bitmap_rxp; + unsigned long bitmap_uc; + unsigned long bitmap_l3l4; + unsigned long bitmap_mac; + u8 link_status; + u8 vf_link_notify[DN200_MAX_VF_NUM]; + u16 vxlan_status; + u8 uc_mac_addr[DN200_MAX_VF_NUM][ETH_ALEN]; + struct dn200_itr_sync itr_sync_info; + struct dn200_sriov_phy_info sriov_phy_info; + struct mac_addr_route mac_addr[DN200_MAX_MC_ADDR_NUM]; + union l3l4_info_t l3l4_info[32]; + struct dn200_heartbeat_info heartbeat; + unsigned long bitmap_allmucast; + unsigned long bitmap_promisc; + struct mac_addr_route 
pf_uc_mac_addr[DN200_MAX_UC_MAC_ADDR_NUM]; + u8 pf_states; + u8 pf_fw_err_states; + u8 vf_upgrade_state[DN200_MAX_VF_NUM]; +} __aligned(8); + +#define DN200_MAX_SIZE_MAILBOX_INFO (1536) +_Static_assert((sizeof(struct dn200_mailbox_info) < DN200_MAX_SIZE_MAILBOX_INFO), "struct mailbox_info cannot exceed 2K!!"); + +#define DN200_SRIOV_QUEUE_OFFSET (DN200_MAX_SIZE_MAILBOX_INFO) + +struct dn200_sriov_queue_info { + struct dn200_vf_info vf_info[DN200_MAX_VF_NUM]; +} __aligned(8); +#define DN200_MAX_SIZE_SRIOV_QUEUE_INFO (256) +_Static_assert((sizeof(struct dn200_sriov_queue_info) < DN200_MAX_SIZE_SRIOV_QUEUE_INFO), "struct sriov_queue cannot exceed 512!!"); +#define DN200_LOCK_INFO_OFFSET (DN200_SRIOV_QUEUE_OFFSET + DN200_MAX_SIZE_SRIOV_QUEUE_INFO) +struct dn200_sriov_lock_info { + /* BIT 0 LOCKED + * BIT 1-16 LOCK_FUNC + * else reserved + */ + u32 lock_info; +} __aligned(8); +#define DN200_SIZE_MAX_LOCK_INFO (16) +_Static_assert((sizeof(struct dn200_sriov_lock_info) < DN200_SIZE_MAX_LOCK_INFO), "struct sriov_msix_info cannot exceed 128!!"); +#define DN200_VFMAILBOX_SIZE 16 /* 16 32 bit words - 64 bytes */ +#define DN200_MAILBOX_INFO_OFFSET (DN200_LOCK_INFO_OFFSET + DN200_SIZE_MAX_LOCK_INFO) +struct dn200_sriov_mbx_info { + u32 msg_type; + u32 msg_len; + u32 msgbuf[DN200_VFMAILBOX_SIZE]; + struct dn200_heartbeat_info heartbeat; +} __aligned(8); +#define DN200_SIZE_SRIOV_MBX_INFO (192) +_Static_assert((sizeof(struct dn200_sriov_mbx_info) < DN200_SIZE_SRIOV_MBX_INFO), "struct sriov_mbx_info cannot exceed 256!!"); + +#define DN200_VER_INFO_OFFSET (DN200_MAILBOX_INFO_OFFSET + DN200_SIZE_SRIOV_MBX_INFO) +struct dn200_sriov_ver_info { + struct dn200_ver dn200_ver_info; +} __aligned(8); +#define DN200_SIZE_SRIOV_VER_INFO (16) +_Static_assert((sizeof(struct dn200_sriov_ver_info) < DN200_SIZE_SRIOV_VER_INFO), "struct sriov_ver_info cannot exceed 128!!"); + +#define DN200_RXP_INFO_OFFSET (DN200_VER_INFO_OFFSET + DN200_SIZE_SRIOV_VER_INFO) + +#define LRAM_PRIV_MAC_PF_OFFSET(priv) 
\ + ((priv)->plat_ex->pf.ioaddr + DN200_SRIOV_MAC_OFFSET + (priv)->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) +#define LRAM_MAC_PF_OFFSET(hw) \ + ((hw)->pmail + DN200_SRIOV_MAC_OFFSET + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) +#define LRAM_QUEUE_PF_OFFSET(hw) \ + ((hw)->pmail + DN200_SRIOV_QUEUE_OFFSET + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) +#define LRAM_LOCK_PF_OFFSET(hw) \ + ((hw)->pmail + DN200_LOCK_INFO_OFFSET + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) +#define LRAM_MBX_PF_OFFSET(hw) \ + ((hw)->pmail + DN200_MAILBOX_INFO_OFFSET + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) +#define LRAM_PRIV_MAC_VF_OFFSET(priv, vf_id, i) \ + (LRAM_PRIV_MAC_PF_OFFSET(priv) + offsetof(struct dn200_mailbox_info, uc_mac_addr) + vf_id * 6 + i) +#define LRAM_VER_PF_OFFSET(hw) \ + ((hw)->pmail + DN200_VER_INFO_OFFSET + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF) + +_Static_assert(((DN200_VER_INFO_OFFSET + DN200_SIZE_SRIOV_VER_INFO) < 2048), "LRAM size per PF cannot exceed 2K"); + +struct dn200_upgrade_info { + u32 magic_num; + u32 nic_st; + u32 host_st; + u32 rsv; + u32 pf_stop[4]; + u32 pf_start[4]; + u32 upgrade_flag; +} __aligned(8); +_Static_assert(sizeof(struct dn200_upgrade_info) < 64, "LRAM upgrade info's size cannot exceed 64"); + +/* ========================================== + */ +#define DN200_ITR_SYNC_GET(hw, member, offset, u8data) \ +do { \ + u8 *__data = u8data; \ + *__data = readb(LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, itr_sync_info) + offsetof(struct dn200_itr_sync, member) + (offset)); \ +} while (0) + +#define DN200_ITR_SYNC_SET(hw, member, offset, u8data) \ +do { \ + u8 __data = u8data; \ + writeb(__data, LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, itr_sync_info) + offsetof(struct dn200_itr_sync, member) + (offset)); \ +} while (0) + +#define DN200_HEARTBEAT_GET(hw, member, vfoffset, u8data) \ +do { \ + u8 *__data = u8data; \ + *__data = readb(LRAM_MBX_PF_OFFSET(hw) + 
offsetof(struct dn200_sriov_mbx_info, heartbeat) + offsetof(struct dn200_heartbeat_info, member) + (vfoffset)); \ +} while (0) + +#define DN200_HEARTBEAT_SET(hw, member, vfoffset, u8data) \ +do { \ + u8 __data = u8data; \ + writeb(__data, LRAM_MBX_PF_OFFSET(hw) + offsetof(struct dn200_sriov_mbx_info, heartbeat) + offsetof(struct dn200_heartbeat_info, member) + (vfoffset)); \ +} while (0) + +#define DN200_VF_UPGRADE_GET(priv, vfoffset, u8data) \ +do { \ + u8 *__data = u8data; \ + *__data = readb(LRAM_PRIV_MAC_PF_OFFSET(priv) + offsetof(struct dn200_mailbox_info, vf_upgrade_state) + (vfoffset)); \ +} while (0) + +#define DN200_VF_UPGRADE_SET(priv, vfoffset, u8data) \ +do { \ + u8 __data = u8data; \ + writeb(__data, LRAM_PRIV_MAC_PF_OFFSET(priv) + offsetof(struct dn200_mailbox_info, vf_upgrade_state) + (vfoffset)); \ +} while (0) + +#define DN200_VF_LINK_GET(priv, vfoffset, u8data) \ +do { \ + u8 *__data = u8data; \ + *__data = readb(LRAM_PRIV_MAC_PF_OFFSET(priv) + offsetof(struct dn200_mailbox_info, vf_link_notify) + (vfoffset)); \ +} while (0) + +#define DN200_VF_LINK_SET(priv, vfoffset, u8data) \ +do { \ + u8 __data = u8data; \ + writeb(__data, LRAM_PRIV_MAC_PF_OFFSET(priv) + offsetof(struct dn200_mailbox_info, vf_link_notify) + (vfoffset)); \ +} while (0) + +#define DN200_GET_LRAM_MAILBOX_MEMBER(hw, member, data) \ +do { \ + void *__mptr = (void *)(data); \ + switch (sizeof_field(struct dn200_mailbox_info, member)) { \ + case 1: \ + *(u8 *)__mptr = readb(LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 2: \ + *(u16 *)__mptr = readw(LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 4: \ + *(u32 *)__mptr = readl(LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 8: \ + *(u64 *)__mptr = readq(LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + default: \ + *(u8 *)__mptr = 0; \ + break; \ + } \ +} while (0) + 
+#define DN200_SET_LRAM_MAILBOX_MEMBER(hw, member, data) \ +do { \ + switch (sizeof_field(struct dn200_mailbox_info, member)) { \ + case 1: \ + writeb((u8)data, LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 2: \ + writew((u16)data, LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 4: \ + writel((u32)data, LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + case 8: \ + writeq((u64)data, LRAM_MAC_PF_OFFSET(hw) + offsetof(struct dn200_mailbox_info, member)); \ + break; \ + default: \ + break; \ + } \ +} while (0) + +#define DN200_GET_LRAM_UPGRADE_MEMBER(hw, member, data) \ +do { \ + void *__mptr = (void *)(data); \ + switch (sizeof_field(struct dn200_upgrade_info, member)) { \ + case 1: \ + *(u8 *)__mptr = readb((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 2: \ + *(u16 *)__mptr = readw((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 4: \ + *(u32 *)__mptr = readl((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 8: \ + *(u64 *)__mptr = readq((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + default: \ + *(u8 *)__mptr = 0; \ + break; \ + } \ +} while (0) + +#define DN200_SET_LRAM_UPGRADE_MEMBER(hw, member, data) \ +do { \ + switch (sizeof_field(struct dn200_upgrade_info, member)) { \ + case 1: \ + writeb((u8)data, (hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 2: \ + writew((u16)data, (hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 4: \ + writel((u32)data, (hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + case 8: \ + writeq((u64)data, (hw)->pmail - 
LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, member)); \ + break; \ + default: \ + break; \ + } \ +} while (0) + +#define DN200_SET_LRAM_UPGRADE_PF(hw, u32data, pf_id) \ + writel((u32)u32data, (hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, pf_stop) + ((pf_id) * 4)) \ + +#define DN200_GET_LRAM_UPGRADE_PF(hw, u32data, pf_id) \ +do { \ + u32 *__data = u32data; \ + *__data = readl((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, pf_stop) + ((pf_id) * 4)); \ +} while (0) + +#define DN200_SET_LRAM_UPGRADE_PF_FINISH(hw, u32data, pf_id) \ + writel((u32)u32data, (hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, pf_start) + ((pf_id) * 4)) \ + +#define DN200_GET_LRAM_UPGRADE_PF_FINISH(hw, u32data, pf_id) \ +do { \ + u32 *__data = u32data; \ + *__data = readl((hw)->pmail - LRAM_UPGRADE_OFFSET + \ + offsetof(struct dn200_upgrade_info, pf_start) + ((pf_id) * 4)); \ +} while (0) + +enum dn200_upgrade_state { + DN200_UNFINISH_FLAG = 1, + DN200_STOP_FLAG, + DN200_JMP_FAIL_FLAG, + DN200_START_FLAG, +}; +/* dn200 vf dev info*/ +/* =========================== */ + +#define DN200_VF_RXP_ASYNC_INFO_SIZE \ + sizeof(struct dn200_vf_rxp_async_info) + +#define DN200_VF_RXP_WB_INFO_SIZE \ + sizeof(struct dn200_vf_rxp_async_wb) + +#define DN200_VF_TOTAL_RXP_INFO_SIZE (256) +_Static_assert(((DN200_VF_RXP_ASYNC_INFO_SIZE + DN200_VF_RXP_WB_INFO_SIZE) < DN200_VF_TOTAL_RXP_INFO_SIZE), "RXP INFO exceed 256"); + +#define DN200_VF_RXP_BASE (16384) +#define LRAM_VF_RXP_INFO_OFFSET(hw, vf_num) \ + ((hw)->pmail + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF + DN200_RXP_INFO_OFFSET \ + + DN200_VF_TOTAL_RXP_INFO_SIZE * (vf_num)) + +#define LRAM_VF_RXP_WB_INFO_OFFSET(hw, vf_num) \ + ((hw)->pmail + (hw)->priv->plat_ex->pf_id * SRIOV_MAX_SIZE_PER_PF + DN200_RXP_INFO_OFFSET \ + + DN200_VF_TOTAL_RXP_INFO_SIZE * (vf_num) + DN200_VF_RXP_ASYNC_INFO_SIZE) + +void dn200_sriov_reconfig_hw_feature(struct 
dn200_priv *priv, struct dma_features *dma_cap); +int dn200_sriov_configure(struct pci_dev *pdev, int num_vfs); +int dn200_sriov_disable(struct dn200_priv *priv); +int dn200_sriov_enable(struct dn200_priv *priv, int num_vfs); +void dn200_get_func_mac_addr(struct mac_device_info *hw, u8 offset, struct mac_addr_route *mac_addr); +void dn200_set_func_mac_addr(struct mac_device_info *hw, u8 offset, struct mac_addr_route *mac_addr); +void dn200_get_func_uc_mac_addr(struct mac_device_info *hw, u8 offset, struct mac_addr_route *mac_addr); +void dn200_set_func_uc_mac_addr(struct mac_device_info *hw, u8 offset, struct mac_addr_route *mac_addr); +int dn200_hw_lock(struct mac_device_info *hw, bool *is_locked); +void dn200_hw_unlock(struct mac_device_info *hw, bool *is_locked); +void dn200_get_vf_queue_info(void __iomem *mailbox, struct dn200_vf_info *info, u8 pf_id, u8 funcid); +int dn200_get_prev_used_bit(unsigned long *bitmap, u8 offset); +int dn200_get_next_used_bit(unsigned long *bitmap, u8 offset, u8 last); +int dn200_get_unused_bit(unsigned long *bitmap, u8 last, u8 first); +void dn200_pf_set_link_status(struct mac_device_info *hw, u8 link_status); +u8 dn200_vf_get_link_status(struct mac_device_info *hw); +void dn200_set_phy_info(struct mac_device_info *hw, struct dn200_sriov_phy_info *info); +void dn200_get_phy_info(struct mac_device_info *hw, struct dn200_sriov_phy_info *info); +int dn200_get_l3l4_filter_offset(struct mac_device_info *hw, int idx); +int dn200_set_l3l4_filter_info(struct mac_device_info *hw, int idx, int offset, bool clear); +void dn200_sriov_mail_init(struct dn200_priv *priv); +void dn200_sriov_static_init(struct dn200_priv *priv); +int dn200_sriov_vlan_entry_update(struct dn200_priv *priv); +u16 dn200_get_vxlan_status(struct mac_device_info *hw); +void dn200_set_vxlan_status(struct mac_device_info *hw, u16 vxlan_status); +int dn200_vf_glb_err_rst_notify(struct dn200_priv *priv); +int dn200_pf_glb_err_rst_process(struct dn200_priv *priv); +void 
dn200_vf_flow_state_set(struct dn200_priv *priv, u8 flow_sate); +bool dn200_all_vf_flow_state_wait(struct dn200_priv *priv, u8 wait_state, bool in_task); +void dn200_all_vf_flow_state_clear(struct dn200_priv *priv); +void dn200_vf_flow_close(struct dn200_priv *priv); +void dn200_vf_flow_open(struct dn200_priv *priv); +void _dn200_vf_flow_open(struct dn200_priv *priv); +void dn200_sriov_ver_get(struct dn200_priv *priv, struct dn200_ver *ver_info); +void dn200_sriov_ver_set(struct dn200_priv *priv, struct dn200_ver *ver_info); +void dn200_reset_lram_rxp_async_info(struct mac_device_info *hw); +void dn200_get_lram_rxp_async_crc32(struct mac_device_info *hw, u8 vf_num, u8 *info); +void dn200_set_lram_rxp_async_info(struct mac_device_info *hw, u8 *info); +void dn200_get_lram_rxp_async_info(struct mac_device_info *hw, u8 *info, u8 vf_num); +void dn200_set_lram_rxp_wb_info(struct mac_device_info *hw, u8 *info); +void dn200_get_lram_rxp_wb_info(struct mac_device_info *hw, u8 *info, u8 vf_num); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/dwxgmac2_core.c b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_core.c new file mode 100644 index 000000000000..6fa8eedbdd02 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_core.c @@ -0,0 +1,4457 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include +#include +#include +#include "dn200.h" +#include "dn200_ptp.h" +#include "dwxgmac_comm.h" + +static int dn200_del_vf_uc_rxp_da_route(struct mac_device_info *hw, + int offset); +static int dn200_del_pf_uc_rxp_da_route(struct mac_device_info *hw, + int offset); +static int dn200_add_pf_uc_rxp_da_route_sriov(struct mac_device_info *hw, + u8 *mac_addr, int offset); +static int dn200_add_vf_uc_rxp_da_route_sriov(struct mac_device_info *hw, + u8 *mac_addr, int offset, u8 rxq_start); +static void dn200_clear_mc_da_route(struct mac_device_info *hw, u8 rxq_start); +static int dn200_mc_add_rxp(struct mac_device_info *hw, u8 *mac_addr, u8 rxq_start); +static int dn200_pf_lram_uc_add_rxp(struct mac_device_info *hw, + u8 *mac_addr); +static void dn200_update_bcmc_channel(struct mac_device_info *hw, + int rxp_offset, bool add, u8 rxq_start); +static void dn200_update_ampm_rxp(struct mac_device_info *hw, u8 rxq_start); +static int dwxgmac_reset_rxp(struct mac_device_info *hw); +static void dn200_clear_lram_pf_uc_rxp(struct mac_device_info *hw); +static int dwxgmac2_rxp_get_single_entry_sriov(struct mac_device_info *hw, + u32 *data, int real_pos); +static int dwxgmac2_rxp_update_single_entry_sriov(struct mac_device_info *hw, + u32 data, int real_pos); +static int dwxgmac2_rxp_get_single_entry_sriov(struct mac_device_info *hw, + u32 *data, int real_pos); + +static int dwxgmac2_rxp_update_single_entry_sriov(struct mac_device_info *hw, + u32 data, int real_pos); +static int dwxgmac2_rxp_update_single_da_entry_sriov(struct mac_device_info *hw, + struct RXP_FPR_ENTRY + *entry, int pos); +static void dwxgmac3_rxp_enable(void __iomem *ioaddr); + +static void dwxgmac2_map_mtl_to_dma(struct mac_device_info *hw, u32 queue, + u32 chan); +static void dn200_clear_allmu_promisc_da_route(struct mac_device_info *hw, + int offset, u8 rxq_start); +static int dwxgmac2_rxp_get_single_da_entry_sriov(struct mac_device_info *hw, + struct RXP_FPR_ENTRY *entry, + int pos); +static int 
rxp_offset_get_from_bitmap(int bitmap_off); +static int dn200_get_used_bit_from_last(unsigned long *bitmap, u8 last, + u8 first); +static void dn200_mc_rxp_channel_route_set(struct mac_device_info *hw, + bool enable, u16 bitmap_promisc); + +static void dwxgmac2_core_init(struct mac_device_info *hw, + struct net_device *dev) +{ + void __iomem *ioaddr = hw->pcsr; + u32 tx, rx; + + tx = readl(ioaddr + XGMAC_TX_CONFIG); + rx = readl(ioaddr + XGMAC_RX_CONFIG); + + tx |= XGMAC_CORE_INIT_TX; + rx |= XGMAC_CORE_INIT_RX; + + if (hw->ps) { + tx |= XGMAC_CONFIG_TE; + tx &= ~hw->link.speed_mask; + + switch (hw->ps) { + case SPEED_10000: + tx |= hw->link.xgmii.speed10000; + break; + case SPEED_2500: + tx |= hw->link.speed2500; + break; + case SPEED_1000: + default: + tx |= hw->link.speed1000; + break; + } + } + if (HW_IS_VF(hw)) + return; + writel(tx, ioaddr + XGMAC_TX_CONFIG); + writel(rx | XGMAC_CONFIG_CST | XGMAC_CONFIG_ACS, + ioaddr + XGMAC_RX_CONFIG); + writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); +} + +static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable, + struct mac_device_info *hw) +{ + u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); + u32 rx = readl(ioaddr + XGMAC_RX_CONFIG); + + if (enable) { + tx |= XGMAC_CONFIG_TE; + rx |= XGMAC_CONFIG_RE; + } else { + tx &= ~XGMAC_CONFIG_TE; + rx &= ~XGMAC_CONFIG_RE; + } + if (HW_IS_VF(hw)) + return; + writel(tx, ioaddr + XGMAC_TX_CONFIG); + writel(rx, ioaddr + XGMAC_RX_CONFIG); +} + +static void dwxgmac2_mac_rx_set(void __iomem *ioaddr, bool enable) +{ + u32 rx = readl(ioaddr + XGMAC_RX_CONFIG); + + if (enable) + rx |= XGMAC_CONFIG_RE; + else + rx &= ~XGMAC_CONFIG_RE; + + writel(rx, ioaddr + XGMAC_RX_CONFIG); +} + +static int dwxgmac2_mac_rx_get(void __iomem *ioaddr) +{ + u32 rx = readl(ioaddr + XGMAC_RX_CONFIG); + + return !!(rx & XGMAC_CONFIG_RE); +} + +static int dwxgmac2_rx_ipc(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + value = readl(ioaddr + XGMAC_RX_CONFIG); + if 
(hw->rx_csum) + value |= XGMAC_CONFIG_IPC; + else + value &= ~XGMAC_CONFIG_IPC; + writel(value, ioaddr + XGMAC_RX_CONFIG); + + return !!(readl(ioaddr + XGMAC_RX_CONFIG) & XGMAC_CONFIG_IPC); +} + +static void dwxgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, + u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + queue += DN200_RXQ_START_GET(hw); + if (!DN200_MTL_QUEUE_IS_VALID(hw->priv, queue)) + return; + + value = readl(ioaddr + XGMAC_RXQ_CTRL0) & ~XGMAC_RXQEN(queue); + if (mode == MTL_QUEUE_AVB) + value |= 0x1 << XGMAC_RXQEN_SHIFT(queue); + else if (mode == MTL_QUEUE_DCB) + value |= 0x2 << XGMAC_RXQEN_SHIFT(queue); + writel(value, ioaddr + XGMAC_RXQ_CTRL0); +} + +static void dwxgmac2_rx_queue_disable(struct mac_device_info *hw, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + queue += DN200_RXQ_START_GET(hw); + if (!DN200_MTL_QUEUE_IS_VALID(hw->priv, queue)) + return; + + value = readl(ioaddr + XGMAC_RXQ_CTRL0) & ~XGMAC_RXQEN(queue); + writel(value, ioaddr + XGMAC_RXQ_CTRL0); +} + +static void dwxgmac2_rx_dds_config_sriov(struct mac_device_info *hw, + bool enable) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + if (!enable) { + /*disable DDS */ + value = readl(ioaddr + XGMAC_MAC_EXT_CONF); + value &= ~XGMAC_DDS_ENABLE; + writel(value, ioaddr + XGMAC_MAC_EXT_CONF); + + //clear mcbc router + value = readl(ioaddr + XGMAC_RXQ_CTRL1); + value &= ~XGMAC_MCBCQEN; + value &= ~XGMAC_MCBCQ; + writel(value, ioaddr + XGMAC_RXQ_CTRL1); + } else { + /*enable DDS */ + value = readl(ioaddr + XGMAC_MAC_EXT_CONF); + value |= XGMAC_DDS_ENABLE; + writel(value, ioaddr + XGMAC_MAC_EXT_CONF); + value = readl(ioaddr + XGMAC_RXQ_CTRL4); + value |= XGMAC_UDC; + writel(value, ioaddr + XGMAC_RXQ_CTRL4); + //route mcbc to rx queue 15 + value = readl(ioaddr + XGMAC_RXQ_CTRL1); + value |= XGMAC_MCBCQEN; + value |= ((DN200_LAST_QUEUE(hw->priv)) << XGMAC_MCBCQ_SHIFT); + value &= ~(XGMAC_RQ); + value |= 
(DN200_LAST_QUEUE(hw->priv) << XGMAC_RQ_SHIFT); + value &= ~(XGMAC_UPQ); + value |= (DN200_LAST_QUEUE(hw->priv)); + writel(value, ioaddr + XGMAC_RXQ_CTRL1); + } +} + +static void dwxgmac2_rx_queue_prio(struct mac_device_info *hw, u32 prio, + u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, reg; + + if (HW_IS_VF(hw)) + return; + queue += DN200_RXQ_START_GET(hw); + reg = (queue < 4) ? XGMAC_RXQ_CTRL2 : XGMAC_RXQ_CTRL3; + if (queue >= 4) + queue -= 4; + + value = readl(ioaddr + reg); + value &= ~XGMAC_PSRQ(queue); + value |= (prio << XGMAC_PSRQ_SHIFT(queue)) & XGMAC_PSRQ(queue); + + writel(value, ioaddr + reg); +} + +static void dwxgmac2_tx_queue_prio(struct mac_device_info *hw, u32 prio, + u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, reg; + + if (HW_IS_VF(hw)) + return; + queue += DN200_RXQ_START_GET(hw); + reg = (queue < 4) ? XGMAC_TC_PRTY_MAP0 : XGMAC_TC_PRTY_MAP1; + if (queue >= 4) + queue -= 4; + + value = readl(ioaddr + reg); + value &= ~XGMAC_PSTC(queue); + value |= (prio << XGMAC_PSTC_SHIFT(queue)) & XGMAC_PSTC(queue); + + writel(value, ioaddr + reg); +} + +static void dwxgmac2_prog_mtl_rx_algorithms(struct mac_device_info *hw, + u32 rx_alg) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_MTL_OPMODE); + value &= ~XGMAC_RAA; + + switch (rx_alg) { + case MTL_RX_ALGORITHM_SP: + break; + case MTL_RX_ALGORITHM_WSP: + value |= XGMAC_RAA; + break; + default: + break; + } + + writel(value, ioaddr + XGMAC_MTL_OPMODE); +} + +static void dwxgmac2_prog_mtl_tx_algorithms(struct mac_device_info *hw, + u32 tx_alg) +{ + void __iomem *ioaddr = hw->pcsr; + bool ets = true; + u32 value; + int i; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_MTL_OPMODE); + value &= ~XGMAC_ETSALG; + + switch (tx_alg) { + case MTL_TX_ALGORITHM_WRR: + value |= XGMAC_WRR; + break; + case MTL_TX_ALGORITHM_WFQ: + value |= XGMAC_WFQ; + break; + case MTL_TX_ALGORITHM_DWRR: + value |= 
XGMAC_DWRR; + break; + default: + ets = false; + break; + } + + writel(value, ioaddr + XGMAC_MTL_OPMODE); + + /* Set ETS if desired */ + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) { + value = readl(ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + value &= ~XGMAC_TSA; + if (ets) + value |= XGMAC_ETS; + writel(value, ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(i)); + } +} + +static void dwxgmac2_set_mtl_tx_queue_weight(struct mac_device_info *hw, + u32 weight, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + + queue += DN200_RXQ_START_GET(hw); + writel(weight, ioaddr + XGMAC_MTL_TCx_QUANTUM_WEIGHT(queue)); +} + +static void dwxgmac2_set_mtl_rx_queue_weight(struct mac_device_info *hw, + u32 weight, u32 queue) +{ + u32 value; + void __iomem *ioaddr = hw->pcsr; + + queue += DN200_RXQ_START_GET(hw); + + value = readl(ioaddr + XGMAC_MTL_RXQ_WEIGHT(queue)); + value &= ~XGMAC_RXQ_WEIGHT; + value |= weight & XGMAC_RXQ_WEIGHT; + writel(value, ioaddr + XGMAC_MTL_RXQ_WEIGHT(queue)); +} + +static void dwxgmac2_mtl_reset(struct mac_device_info *hw, u32 queue, u32 chan, + u8 mode) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, reg, que; + int i; + + queue += DN200_RXQ_START_GET(hw); + chan += DN200_RXQ_START_GET(hw); + /* weight reset */ + writel(0, ioaddr + XGMAC_MTL_TCx_QUANTUM_WEIGHT(queue)); + + /*cbs */ + writel(0, ioaddr + XGMAC_MTL_TCx_SENDSLOPE(queue)); + writel(0, ioaddr + XGMAC_MTL_TCx_QUANTUM_WEIGHT(queue)); + writel(0, ioaddr + XGMAC_MTL_TCx_HICREDIT(queue)); + writel(0, ioaddr + XGMAC_MTL_TCx_LOCREDIT(queue)); + writel(0, ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(queue)); + + /*map */ + reg = (queue / 4) * 4 + XGMAC_MTL_RXQ_DMA_MAP0; + que = queue % 4; + value = readl(ioaddr + reg); + value &= ~XGMAC_QxMDMACH(que); + writel(value, ioaddr + reg); + + /*disable queue */ + value = readl(ioaddr + XGMAC_RXQ_CTRL0) & ~XGMAC_RXQEN(queue); + writel(value, ioaddr + XGMAC_RXQ_CTRL0); + value = readl(ioaddr + XGMAC_RXQ_CTRL0) & ~XGMAC_RXQEN(queue); + for (i = 0; i < XGMAC_PER_REGSIZE - 0x24; i = i 
+ 4) + writel(0, ioaddr + XGMAC_MTL_TXQ_OPMODE(queue) + i); + + writel(readl(ioaddr + XGMAC_MTL_QINTEN(queue)) & (~(BIT(16))), + ioaddr + XGMAC_MTL_QINTEN(queue)); + writel(~0, ioaddr + XGMAC_MTL_QINT_STATUS(queue)); + writel(readl(ioaddr + XGMAC_MTL_QINT_STATUS(queue)) & (~(BIT(1))), + ioaddr + XGMAC_MTL_QINT_STATUS(queue)); +} + +static int dwxgmac2_mtl_flush(struct mac_device_info *hw, u32 queue) +{ + u32 val; + void __iomem *ioaddr = hw->pcsr; + + queue += DN200_RXQ_START_GET(hw); + val = readl(ioaddr + XGMAC_MTL_TXQ_OPMODE(queue)); + writel((val | 1), ioaddr + XGMAC_MTL_TXQ_OPMODE(queue)); + /* Wait for done */ + return readl_poll_timeout(ioaddr + XGMAC_MTL_TXQ_OPMODE(queue), + val, !(val & BIT(0)), 100, 100000); +} + +static void dwxgmac2_map_mtl_to_dma(struct mac_device_info *hw, u32 queue, + u32 chan) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, reg; + + queue += DN200_RXQ_START_GET(hw); + chan += DN200_RXQ_START_GET(hw); + reg = (queue / 4) * 4 + XGMAC_MTL_RXQ_DMA_MAP0; + queue %= 4; + value = readl(ioaddr + reg); + value &= ~XGMAC_QxMDMACH(queue); + value |= (chan << XGMAC_QxMDMACH_SHIFT(queue)) & XGMAC_QxMDMACH(queue); + + writel(value, ioaddr + reg); +} + +static void dwxgmac2_mtl_dynamic_chan_set(struct mac_device_info *hw, + u32 queue, bool dynamic) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, reg; + + queue += DN200_RXQ_START_GET(hw); + if (!DN200_MTL_QUEUE_IS_VALID(hw->priv, queue)) + return; + + reg = (queue / 4) * 4 + XGMAC_MTL_RXQ_DMA_MAP0; + queue %= 4; + value = readl(ioaddr + reg); + if (dynamic) + value |= XGMAC_QxMDMACH_DYN_SEL(queue); + else + value &= ~XGMAC_QxMDMACH_DYN_SEL(queue); + + writel(value, ioaddr + reg); +} + +static void dwxgmac2_config_cbs(struct mac_device_info *hw, + u32 send_slope, u32 idle_slope, + u32 high_credit, u32 low_credit, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + queue += DN200_RXQ_START_GET(hw); + writel(send_slope, ioaddr + XGMAC_MTL_TCx_SENDSLOPE(queue)); + writel(idle_slope, 
ioaddr + XGMAC_MTL_TCx_QUANTUM_WEIGHT(queue)); + writel(high_credit, ioaddr + XGMAC_MTL_TCx_HICREDIT(queue)); + writel(low_credit, ioaddr + XGMAC_MTL_TCx_LOCREDIT(queue)); + + value = readl(ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(queue)); + value &= ~XGMAC_TSA; + value |= XGMAC_CC | XGMAC_CBS; + writel(value, ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(queue)); +} + +static void dwxgmac2_dump_regs(struct mac_device_info *hw, u32 *reg_space) +{ + void __iomem *ioaddr = hw->pcsr; + int i; + + for (i = 0; i < XGMAC_MAC_REGSIZE; i++) + reg_space[i] = readl(ioaddr + i * 4); +} + +static int dwxgmac2_host_irq_status(struct mac_device_info *hw, + struct dn200_extra_stats *x) +{ + void __iomem *ioaddr = hw->pcsr; + u32 stat, en; + int ret = 0; + + en = readl(ioaddr + XGMAC_INT_EN); + stat = readl(ioaddr + XGMAC_INT_STATUS); + + stat &= en; + + if (stat & XGMAC_PMTIS) { + x->irq_receive_pmt_irq_n++; + readl(ioaddr + XGMAC_PMT); + } + + if (stat & XGMAC_LPIIS) { + u32 lpi = readl(ioaddr + XGMAC_LPI_CTRL); + + if (lpi & XGMAC_TLPIEN) { + ret |= CORE_IRQ_TX_PATH_IN_LPI_MODE; + x->irq_tx_path_in_lpi_mode_n++; + } + if (lpi & XGMAC_TLPIEX) { + ret |= CORE_IRQ_TX_PATH_EXIT_LPI_MODE; + x->irq_tx_path_exit_lpi_mode_n++; + } + if (lpi & XGMAC_RLPIEN) + x->irq_rx_path_in_lpi_mode_n++; + if (lpi & XGMAC_RLPIEX) + x->irq_rx_path_exit_lpi_mode_n++; + } + + return ret; +} + +static int dwxgmac2_host_mtl_irq_status(struct mac_device_info *hw, u32 chan) +{ + void __iomem *ioaddr = hw->pcsr; + int ret = 0; + u32 status; + + chan += DN200_RXQ_START_GET(hw); + status = readl(ioaddr + XGMAC_MTL_INT_STATUS); + if (status & BIT(chan)) { + u32 chan_status = readl(ioaddr + XGMAC_MTL_QINT_STATUS(chan)); + + if (chan_status & XGMAC_RXOVFIS) + ret |= CORE_IRQ_MTL_RX_OVERFLOW; + + writel(~0x0, ioaddr + XGMAC_MTL_QINT_STATUS(chan)); + } + + return ret; +} + +static void dwxgmac2_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, + unsigned int fc, unsigned int pause_time, + u32 tx_cnt) +{ + void __iomem 
*ioaddr = hw->pcsr; + struct dn200_priv *priv = hw->priv; + u32 i; + + if (fc & FLOW_RX) { + writel(XGMAC_RFE, ioaddr + XGMAC_RX_FLOW_CTRL); + } else { + if (priv->pfc && priv->pfc->pfc_en) { + /* pfc mode enabled, do not disable flow control */ + } else { + u32 value = readl(ioaddr + XGMAC_RX_FLOW_CTRL); + + writel(value & ~XGMAC_RFE, ioaddr + XGMAC_RX_FLOW_CTRL); + } + } + if (fc & FLOW_TX) { + for (i = 0; i < tx_cnt; i++) { + u32 value = XGMAC_TFE; + + if (duplex) + value |= pause_time << XGMAC_PT_SHIFT; + + writel(value, + ioaddr + + XGMAC_Qx_TX_FLOW_CTRL((i + + DN200_RXQ_START_GET + (hw)))); + } + } else { + if (priv->pfc && priv->pfc->pfc_en) { + /* pfc mode enabled, do not disable flow control */ + } else { + for (i = 0; i < tx_cnt; i++) { + u32 value = + readl(ioaddr + + XGMAC_Qx_TX_FLOW_CTRL((i + + DN200_RXQ_START_GET + (hw)))); + + value &= ~XGMAC_TFE; + value &= ~(0xffff << XGMAC_PT_SHIFT); + + writel(value, + ioaddr + + XGMAC_Qx_TX_FLOW_CTRL((i + + DN200_RXQ_START_GET + (hw)))); + } + } + } +} + +static int dwxgmac2_indiraccess_write(struct mac_device_info *hw, u32 addr_off, + u8 mode_sel, u32 data); +static int dwxgmac2_wq_set_umac_addr(struct mac_device_info *hw, + unsigned char *addr, unsigned int reg_n, + struct dn200_vf_rxp_async_info *async_info) +{ + u32 value; + int ret = 0; + + if (async_info->is_vf) { + reg_n += 1 + async_info->vf_offset; + ret = dn200_del_vf_uc_rxp_da_route(hw, + DN200_VF_UC_OFF + async_info->vf_offset); + if (ret < 0) { + netdev_err(hw->priv->dev, "%s: del vf uc fail, ret:%d.\n", __func__, ret); + goto XDCS; + } + + ret = dn200_add_vf_uc_rxp_da_route_sriov(hw, &async_info->uc_mac_addr[0], + DN200_VF_UC_OFF + async_info->vf_offset, async_info->rxq_start); + if (ret < 0) { + netdev_err(hw->priv->dev, "%s: add vf uc fail, ret:%d.\n", __func__, ret); + goto XDCS; + } + } else { + ret = dn200_del_pf_uc_rxp_da_route(hw, 1); + if (ret < 0) { + netdev_err(hw->priv->dev, "%s: del pf uc fail, ret:%d.\n", __func__, ret); + goto XDCS; + 
} + ret = dn200_add_pf_uc_rxp_da_route_sriov(hw, addr, 1); + if (ret < 0) { + netdev_err(hw->priv->dev, "%s: add pf uc fail, ret:%d.\n", __func__, ret); + goto XDCS; + } + } +XDCS: + //enable XDCS + value = 1 << async_info->rxq_start; + ret = dwxgmac2_indiraccess_write(hw, (reg_n & 0x1f), XGMAC_INDIR_DCHSEL, + value); + return ret; +} + +static void dwxgmac2_vf_set_async_info(struct mac_device_info *hw, + struct net_device *dev, u8 *wakeup_wq, + u8 type); + +static int dwxgmac2_set_umac_addr(struct mac_device_info *hw, + unsigned char *addr, unsigned int reg_n, u8 *wakeup_wq) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret = 0; + + if (HW_IS_VF(hw)) { + dwxgmac2_vf_set_async_info(hw, hw->priv->dev, wakeup_wq, DN200_VF_SET_UMAC); + return 0; + } else if (!HW_IS_PUREPF(hw)) { + *wakeup_wq = true; + return 0; + } + + value = (DN200_RXQ_START_GET(hw) << 16) | (addr[5] << 8) | addr[4]; + writel(value | XGMAC_AE, ioaddr + XGMAC_ADDRX_HIGH(reg_n)); + + value = (addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) | addr[0]; + writel(value, ioaddr + XGMAC_ADDRX_LOW(reg_n)); + return ret; +} + +static void dwxgmac2_get_umac_addr(struct mac_device_info *hw, + unsigned char *addr, unsigned int reg_n) +{ + void __iomem *ioaddr = hw->pcsr; + u32 hi_addr, lo_addr; + + reg_n += DN200_RXQ_START_GET(hw); + /* Read the MAC address from the hardware */ + hi_addr = readl(ioaddr + XGMAC_ADDRX_HIGH(reg_n)); + lo_addr = readl(ioaddr + XGMAC_ADDRX_LOW(reg_n)); + + /* Extract the MAC address from the high and low words */ + addr[0] = lo_addr & 0xff; + addr[1] = (lo_addr >> 8) & 0xff; + addr[2] = (lo_addr >> 16) & 0xff; + addr[3] = (lo_addr >> 24) & 0xff; + addr[4] = hi_addr & 0xff; + addr[5] = (hi_addr >> 8) & 0xff; +} + +static void dwxgmac2_set_eee_mode(struct mac_device_info *hw, + bool en_tx_lpi_clockgating, bool en_tx_lpi_auto_timer) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_LPI_CTRL); + + value |= 
XGMAC_LPITXEN | XGMAC_LPITXA; + if (en_tx_lpi_clockgating) + value |= XGMAC_TXCGE; + + if (en_tx_lpi_auto_timer) + value |= XGMAC_LPIATE; + else + value &= ~XGMAC_LPIATE; + + writel(value, ioaddr + XGMAC_LPI_CTRL); +} + +static void dwxgmac2_reset_eee_mode(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_LPI_CTRL); + value &= ~(XGMAC_LPITXEN | XGMAC_LPITXA | XGMAC_TXCGE | XGMAC_LPIATE); + writel(value, ioaddr + XGMAC_LPI_CTRL); +} + +static void dwxgmac2_set_eee_pls(struct mac_device_info *hw, int link) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + value = readl(ioaddr + XGMAC_LPI_CTRL); + if (link) + value |= XGMAC_PLS; + else + value &= ~XGMAC_PLS; + writel(value, ioaddr + XGMAC_LPI_CTRL); +} + +static void dwxgmac2_set_eee_timer(struct mac_device_info *hw, int ls, int tw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = (tw & 0xffff) | ((ls & 0x3ff) << 16); + writel(value, ioaddr + XGMAC_LPI_TIMER_CTRL); +} + +static void dwxgmac2_set_mchash(void __iomem *ioaddr, u32 *mcfilterbits, + int mcbitslog2) +{ + int numhashregs, regs; + + switch (mcbitslog2) { + case 6: + numhashregs = 2; + break; + case 7: + numhashregs = 4; + break; + case 8: + numhashregs = 8; + break; + default: + return; + } + + for (regs = 0; regs < numhashregs; regs++) + writel(mcfilterbits[regs], ioaddr + XGMAC_HASH_TABLE(regs)); +} + +static void dwxgamc_get_change(struct mac_device_info *hw, + struct net_device *dev) +{ + unsigned long bitmap_am; + unsigned long bitmap_pm; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, &bitmap_am); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, &bitmap_pm); + if (bitmap_am & (1 << DN200_RXQ_START_GET(hw))) + hw->set_state.is_allmuslt = true; + else + hw->set_state.is_allmuslt = false; + + if (bitmap_pm & (1 << DN200_RXQ_START_GET(hw))) + hw->set_state.is_promisc = true; + else + hw->set_state.is_promisc 
= false; + + hw->set_state.uc_num = + netdev_uc_count(dev) > + DN200_MAX_UC_MAC_ADDR_NUM ? DN200_MAX_UC_MAC_ADDR_NUM : + netdev_uc_count(dev); + hw->set_state.mc_num = + netdev_mc_count(dev) > + DN200_MAX_MC_ADDR_NUM ? DN200_MAX_MC_ADDR_NUM : + netdev_mc_count(dev); +} + +static void dwxgamc_set_vf_change(struct mac_device_info *hw, + struct net_device *dev, u8 rxq_start, u32 seq) +{ + unsigned long bitmap_am; + unsigned long bitmap_pm; + struct dn200_vf_rxp_async_wb wb; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, &bitmap_am); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, &bitmap_pm); + if (bitmap_am & (1 << rxq_start)) + wb.is_promisc = true; + else + wb.is_promisc = false; + + if (bitmap_pm & (1 << rxq_start)) + wb.is_allmuslt = true; + else + wb.is_allmuslt = false; + + wb.uc_num = 0; + wb.mc_num = + netdev_mc_count(dev) > + DN200_MAX_MC_ADDR_NUM ? DN200_MAX_MC_ADDR_NUM : + netdev_mc_count(dev); + wb.seq = seq; + wb.crc32 = crc32_le(~0, (u8 *)&wb + sizeof(u32), sizeof(wb) - sizeof(u32)); + dn200_set_lram_rxp_wb_info(hw, (u8 *)&wb); +} + +static int dwxgamc_comp_change(struct mac_device_info *hw, + struct net_device *dev) +{ + int ret = 1; + int uc_num = 0; + int mc_num = 0; + + if (netdev_uc_count(dev) > DN200_MAX_UC_MAC_ADDR_NUM) + uc_num = DN200_MAX_UC_MAC_ADDR_NUM; + else + uc_num = netdev_uc_count(dev); + if (netdev_mc_count(dev) > DN200_MAX_MC_ADDR_NUM) + mc_num = DN200_MAX_MC_ADDR_NUM; + else + mc_num = netdev_mc_count(dev); + + if ((dev->flags & IFF_PROMISC)) { + if (!hw->set_state.is_promisc) + return ret; + } else { + if (hw->set_state.is_promisc) + return ret; + } + if (dev->flags & IFF_ALLMULTI) { + if (!hw->set_state.is_allmuslt) + return ret; + } else { + if (hw->set_state.is_allmuslt) + return ret; + } + if (hw->set_state.uc_num != uc_num) + return ret; + + if (hw->set_state.mc_num != mc_num) + return ret; + return 0; +} + +static void dwxgmac2_vf_set_async_info(struct mac_device_info *hw, + struct net_device *dev, u8 *wakeup_wq, 
+ u8 type) +{ + size_t info_size = sizeof(struct dn200_vf_rxp_async_info); + struct dn200_vf_rxp_async_info *async_info = NULL; + struct netdev_hw_addr *ha; + u8 mc_of = 0; + u32 crc32, tmp_crc32; + + dn200_get_lram_rxp_async_crc32(hw, DN200_VF_OFFSET_GET(hw), (u8 *)&tmp_crc32); + async_info = devm_kzalloc(hw->priv->device, info_size, GFP_ATOMIC); + if (!async_info) + return; + //to do, delete this + dn200_get_lram_rxp_async_info(hw, (u8 *)async_info, DN200_VF_OFFSET_GET(hw)); + netdev_dbg(hw->priv->dev, "%s %d cur seq %d tmp seq %d\n", __func__, __LINE__, async_info->seq, hw->cfg_rxp_seq); + memset(async_info, 0, info_size); + + async_info->flags = dev->flags; + async_info->vf_offset = DN200_VF_OFFSET_GET(hw); + async_info->seq = hw->cfg_rxp_seq + 1; + async_info->mc_cnt = netdev_mc_count(dev); + async_info->uc_cnt = netdev_uc_count(dev); + ether_addr_copy((u8 *)&async_info->uc_mac_addr[0], dev->dev_addr); + + netdev_for_each_mc_addr(ha, dev) { + ether_addr_copy((u8 *)&async_info->mc_mac_addr[mc_of], ha->addr); + mc_of++; + if (mc_of >= DN200_MAX_MC_ADDR_NUM) + break; + } + async_info->rxq_start = DN200_RXQ_START_GET(hw); + async_info->type = type; + async_info->is_vf = true; + crc32 = crc32_le(~0, (u8 *)async_info + sizeof(u32), info_size - sizeof(u32)); + async_info->crc32 = crc32; + netdev_dbg(hw->priv->dev, "%s %d new crc32 %#x old crc32 %#x seq %d\n", __func__, __LINE__, crc32, tmp_crc32, async_info->seq); + if (crc32 != tmp_crc32) { + dn200_reset_lram_rxp_async_info(hw); + dn200_set_lram_rxp_async_info(hw, (u8 *)async_info); + hw->cfg_rxp_seq++; + *wakeup_wq = true; + } + devm_kfree(hw->priv->device, async_info); +} + +static void dwxgmac2_set_filter_sriov(struct mac_device_info *hw, + struct net_device *dev, u8 *wakeup_wq) +{ + int ret = 0; + + *wakeup_wq = false; + if (HW_IS_VF(hw)) { + dwxgmac2_vf_set_async_info(hw, dev, wakeup_wq, DN200_VF_SET_FLT); + } else { + ret = dwxgamc_comp_change(hw, dev); + if (!ret) + return; + *wakeup_wq = true; + } +} + 
/* Workqueue-context filter programming for one function (PF or VF).
 *
 * Rebuilds the function's multicast RXP routes and promisc/all-multicast
 * bitmaps from scratch: clears the old MC routes, updates the shared LRAM
 * bitmaps according to the function's netdev flags, re-adds MC (and PF UC)
 * RXP entries, refreshes the broadcast/multicast channel map and finally
 * records the applied state (PF: hw->set_state, VF: LRAM write-back).
 * For a VF, @async_info carries the request posted by
 * dwxgmac2_vf_set_async_info(); for a PF it may be NULL and @dev is used.
 */
static void dwxgmac2_wq_set_filter(struct mac_device_info *hw,
				   struct net_device *dev, bool is_vf,
				   struct dn200_vf_rxp_async_info *async_info)
{
	unsigned long bitmap_am;
	unsigned long bitmap_pm;
	u8 rxq_start = is_vf ? async_info->rxq_start : 0;
	u8 i = 0;
	u64 flags = is_vf ? async_info->flags : dev->flags;

	if (!is_vf)
		dn200_clear_lram_pf_uc_rxp(hw);

	/*clear mp and ap */
	DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, &bitmap_am);
	DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, &bitmap_pm);

	dn200_mc_rxp_channel_route_set(hw, false,
				       (1 << rxq_start));
	dn200_clear_mc_da_route(hw, rxq_start);	/*pf and vf */

	if (flags & IFF_ALLMULTI) {
		bitmap_am |= (1 << rxq_start);
		dev_dbg(hw->priv->device, "allmulitcast mode\n");
	} else {
		bitmap_am &= (~(1 << rxq_start));
		dev_dbg(hw->priv->device, "no allmulitcast mode\n");
	}
	if (flags & IFF_PROMISC) {
		/* promisc implies all-multicast as well */
		bitmap_pm |= (1 << rxq_start);
		bitmap_am |= (1 << rxq_start);
		dev_dbg(hw->priv->device, "promisc mode\n");
	} else {
		bitmap_pm &= (~(1 << rxq_start));
		dev_dbg(hw->priv->device, "no promisc mode\n");
	}
	/* Re-add multicast DA routes from the request (VF) or netdev (PF). */
	if (is_vf) {
		if (async_info->mc_cnt && (async_info->flags & IFF_MULTICAST)) {
			for (i = 0; i < async_info->mc_cnt; i++)
				dn200_mc_add_rxp(hw, (u8 *)&async_info->mc_mac_addr[i], rxq_start);
		}
	} else {
		if (!netdev_mc_empty(dev) && (dev->flags & IFF_MULTICAST)) {
			struct netdev_hw_addr *ha;

			netdev_for_each_mc_addr(ha, dev) {
				dn200_mc_add_rxp(hw, ha->addr, rxq_start);
			}
		}

	}

	/* Secondary unicast addresses: a VF with any extra UC address falls
	 * back to promisc routing; a PF does the same only when the UC list
	 * exceeds the perfect-filter capacity, otherwise programs each one.
	 */
	if (is_vf) {
		if (async_info->uc_cnt)
			bitmap_pm |= (1 << rxq_start);
	} else {
		/* Handle multiple unicast addresses */
		if (netdev_uc_count(dev) > DN200_MAX_UC_MAC_ADDR_NUM) {
			bitmap_pm |= (1 << DN200_RXQ_START_GET(hw));
		} else {
			struct netdev_hw_addr *ha;

			netdev_for_each_uc_addr(ha, dev)
				dn200_pf_lram_uc_add_rxp(hw, ha->addr);
		}
	}
	dn200_update_bcmc_channel(hw, 0, true, rxq_start);
	DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, bitmap_pm);
	DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, bitmap_am);
	dn200_mc_rxp_channel_route_set(hw, true, (u16)bitmap_pm);
	dn200_update_ampm_rxp(hw, rxq_start);
	if (!is_vf)
		dwxgamc_get_change(hw, dev);
	else
		dwxgamc_set_vf_change(hw, dev, async_info->rxq_start, async_info->seq);
}

/* Classic (non-SR-IOV, "pure PF") RX filter programming via the XGMAC packet
 * filter register: promisc / all-multicast / MC hash / UC perfect filters,
 * plus VLAN filter enable when the feature is on and promisc is off.
 * @wakeup_wq is unused here (kept for interface symmetry with the SR-IOV
 * variant).
 */
static void dwxgmac2_set_filter_purepf(struct mac_device_info *hw,
				       struct net_device *dev, u8 *wakeup_wq)
{
	void __iomem *ioaddr = (void __iomem *)dev->base_addr;
	u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);
	int mcbitslog2 = hw->mcast_bits_log2;
	u32 mc_filter[8];
	int i;

	/*Close vlan filter when promisc on */
	value &= ~(XGMAC_FILTER_PR | XGMAC_FILTER_HMC |
		   XGMAC_FILTER_PM | XGMAC_FILTER_PCF | XGMAC_FILTER_VTFE);
	value |= XGMAC_FILTER_HPF;

	if ((dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    !(dev->flags & IFF_PROMISC)) {
		value |= XGMAC_FILTER_VTFE;
	}

	memset(mc_filter, 0, sizeof(mc_filter));
	/* Default control-frame handling; overridden below for promisc. */
	value |= (0x3 << XGMAC_FILTER_PCF_SHIFT);
	if (dev->flags & IFF_PROMISC) {
		value |= XGMAC_FILTER_PR;
		value &= ~XGMAC_FILTER_PCF;
		value |= (0x2 << XGMAC_FILTER_PCF_SHIFT);
	} else if ((dev->flags & IFF_ALLMULTI) ||
		   (netdev_mc_count(dev) > hw->multicast_filter_bins)) {
		value |= XGMAC_FILTER_PM;

		/* Pass-all: saturate the hash table. */
		for (i = 0; i < XGMAC_MAX_HASH_TABLE; i++)
			writel(~0x0, ioaddr + XGMAC_HASH_TABLE(i));
	} else if (!netdev_mc_empty(dev) && (dev->flags & IFF_MULTICAST)) {
		struct netdev_hw_addr *ha;

		value |= XGMAC_FILTER_HMC;

		/* CRC32-based hash bin selection, top mcbitslog2 bits. */
		netdev_for_each_mc_addr(ha, dev) {
			u32 nr = (bitrev32(~crc32_le(~0, ha->addr, 6)) >>
				  (32 - mcbitslog2));
			mc_filter[nr >> 5] |= (1 << (nr & 0x1F));
		}
	}

	dwxgmac2_set_mchash(ioaddr, mc_filter, mcbitslog2);

	/* Handle multiple unicast addresses */
	if (netdev_uc_count(dev) > hw->unicast_filter_entries) {
		value |= XGMAC_FILTER_PR;
	} else {
		struct netdev_hw_addr *ha;
		int reg = 1;

		netdev_for_each_uc_addr(ha, dev) {
			dwxgmac2_set_umac_addr(hw, ha->addr, reg, NULL);
			reg++;
		}

		/* Clear the remaining (stale) perfect-filter entries. */
		for (; reg < XGMAC_ADDR_MAX; reg++) {
			writel(0, ioaddr + XGMAC_ADDRX_HIGH(reg));
			writel(0, ioaddr + XGMAC_ADDRX_LOW(reg));
		}
	}
	writel(value, ioaddr + XGMAC_PACKET_FILTER);
}

static void dwxgmac2_hw_vlan_init(struct mac_device_info *hw);

/* Seed the RXP VLAN route chain for SR-IOV (PF only): entry 0 matches the
 * 802.1Q ethertype and chains into the VID list; entry 1 is the initial
 * VID-0 terminator that chains to the all-drop entry.  Finally the RXP
 * engine is (re-)enabled.
 */
static void dwxgmac2_sriov_init_rxp_vlan_route(struct mac_device_info *hw)
{
	u32 channel = 0;
	u32 vlan_tag = 0;
	u32 proto = 0;
	struct RXP_FPR_ENTRY frp_entry_data[2];

	if (HW_IS_VF(hw))
		return;

	/* NOTE(review): redundant re-check — HW_IS_VF() was handled above. */
	if (!HW_IS_VF(hw))
		dwxgmac2_hw_vlan_init(hw);

	channel = (1 << DN200_RXQ_START_GET(hw));
	proto = (htons(ETH_P_8021Q) & 0xffff);
	/*Init vid proto entry */
	memset(&frp_entry_data[0], 0, sizeof(struct RXP_FPR_ENTRY));
	vlan_tag =
	    (htons((u32)0 & 0xfff) << 16) | (htons(ETH_P_8021Q) & 0xffff);
	frp_entry_data[0].match_data = cpu_to_le32(proto);
	frp_entry_data[0].match_en = 0xffff;
	frp_entry_data[0].af = 1;
	frp_entry_data[0].rf = 1;
	frp_entry_data[0].nc = 0;
	frp_entry_data[0].im = 1;
	frp_entry_data[0].dma_ch_no = channel;
	frp_entry_data[0].ok_index = DN200_VLAN_ADDR_START + 1;	//to all drop
	frp_entry_data[0].frame_offset = 3;

	/*Init first vid entry, vid 0 */
	memset(&frp_entry_data[1], 0, sizeof(struct RXP_FPR_ENTRY));
	frp_entry_data[1].match_data = vlan_tag;
	frp_entry_data[1].match_en = 0xffffffff;
	frp_entry_data[1].af = 1;
	frp_entry_data[1].rf = 1;
	frp_entry_data[1].nc = 1;
	frp_entry_data[1].dma_ch_no = channel;
	frp_entry_data[1].ok_index = DN200_ALL_DROP_OFF;	//to all drop
	frp_entry_data[1].frame_offset = 3;

	dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0],
						  DN200_VLAN_ADDR_START);
	dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1],
						  DN200_VLAN_ADDR_START + 1);
	dwxgmac3_rxp_enable(hw->pcsr);
}

/* Append a VID match entry to the RXP VLAN chain at slot @off (PF only).
 * @is_last terminates the chain by pointing ok_index at the all-drop entry;
 * otherwise it chains to the next slot.
 */
static int dwxgmac2_pf_add_rxp_vlan_route(struct net_device *dev,
					  struct mac_device_info *hw,
					  __be16 proto, u16 vid, uint8_t off,
					  bool is_last)
{
	u32 channel = 0;
	u32 vlan_tag = 0;
	u8 rxp_off = 0;
	u16 oki = 0;
	struct RXP_FPR_ENTRY frp_entry_data[1];

	if (HW_IS_VF(hw))
		return 0;
	rxp_off = DN200_VLAN_ADDR_START + off + 1;
	oki = rxp_off + 1;

	if (is_last)
		oki = DN200_ALL_DROP_OFF;	//to all drop
	channel = (1 << DN200_RXQ_START_GET(hw));
	/* VID in the upper half-word of the match pattern. */
	vlan_tag = (htons((u32)vid & 0xfff) << 16);
	memset(&frp_entry_data[0], 0, sizeof(struct RXP_FPR_ENTRY));
	frp_entry_data[0].match_data = cpu_to_le32(vlan_tag);
	frp_entry_data[0].match_en = 0xff0f0000;
	frp_entry_data[0].af = 1;
	frp_entry_data[0].rf = 1;
	frp_entry_data[0].nc = 1;
	frp_entry_data[0].dma_ch_no = channel;

	frp_entry_data[0].ok_index = oki;
	frp_entry_data[0].frame_offset = 3;

	dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0],
						  rxp_off);
	return 0;
}

/* Remove a VID entry from the RXP VLAN chain: the slot is rewritten as a
 * pass-through (match data zeroed, no reject flag) that keeps the chain
 * linkage intact (PF only).
 */
static int dwxgmac2_pf_del_rxp_vlan_route(struct net_device *dev,
					  struct mac_device_info *hw,
					  __be16 proto, u16 vid, uint8_t off,
					  bool is_last)
{
	u32 channel = 0;
	u8 rxp_off = 0;
	u16 oki = 0;
	struct RXP_FPR_ENTRY frp_entry_data[1];

	if (HW_IS_VF(hw))
		return 0;
	rxp_off = DN200_VLAN_ADDR_START + off + 1;
	oki = rxp_off + 1;
	if (is_last)
		oki = DN200_ALL_DROP_OFF;	//to all drop

	channel = (1 << DN200_RXQ_START_GET(hw));
	memset(&frp_entry_data[0], 0, sizeof(struct RXP_FPR_ENTRY));
	frp_entry_data[0].match_data = 0;
	frp_entry_data[0].match_en = 0xffff0000;
	frp_entry_data[0].af = 1;
	frp_entry_data[0].nc = 1;
	frp_entry_data[0].dma_ch_no = channel;

	frp_entry_data[0].ok_index = oki;
	frp_entry_data[0].frame_offset = 3;

	dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0],
						  rxp_off);
	return 0;
}

/* Toggle AF/RF (accept/reject) on the RXP entries that implement VLAN
 * filtering, walking the used-slot bitmap backwards.  Filtering is forced
 * off when the configured VLAN count exceeds the hardware capacity.
 * PF only.  The 1 + 1 + 15 bound mirrors the VLAN chain layout (proto entry
 * + vid-0 entry + max VID slots) — TODO confirm against DN200_VLAN_ADDR_START
 * layout.
 */
static void dwxgmac2_rxp_vlan_filter_config(struct mac_device_info *hw,
					    bool enable)
{
	u32 data;
	unsigned long bitmap;
	u32 prev_off_val = 0;
	int prev_off;
	u8 off = 1 + 1 + 15;
	u8 i = 1 + 1 + 15;

	if (HW_IS_VF(hw))
		return;

	if (hw->priv->plat_ex->vlan_num > (hw->max_vlan_num + 1))
		enable = false;

	DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap);
	while (i > 0) {
		prev_off = dn200_get_prev_used_bit(&bitmap, off);
		prev_off_val = rxp_offset_get_from_bitmap(prev_off);
		if (!prev_off)
			break;

		dwxgmac2_rxp_get_single_entry_sriov(hw, &data,
						    AFRFNC_ENTRY_OFFSET + (prev_off_val + 1));
		if (enable) {
			data &= (~RF_ENABLE);
			data &= (~AF_ENABLE);
		} else {
			data |= RF_ENABLE;
			data |= AF_ENABLE;
		}
		dwxgmac2_rxp_update_single_entry_sriov(hw, data,
						       AFRFNC_ENTRY_OFFSET + (prev_off_val + 1));

		off = prev_off;
		i--;
	}
}

/* Enable/disable MAC-level loopback (LM bit in RX config). Always returns 0. */
static int dwxgmac2_set_mac_loopback(void __iomem *ioaddr, bool enable)
{
	u32 value = readl(ioaddr + XGMAC_RX_CONFIG);

	if (enable)
		value |= XGMAC_CONFIG_LM;
	else
		value &= ~XGMAC_CONFIG_LM;

	writel(value, ioaddr + XGMAC_RX_CONFIG);
	return 0;
}

/* Write one RSS key word or indirection-table entry through the indirect
 * RSS address/data registers and poll OB (operation busy) until done.
 * On the first timeout the operation is retried once after clearing the
 * address register.  Returns 0 or the poll-timeout error.
 */
static int dwxgmac2_rss_write_reg(void __iomem *ioaddr, bool is_key, int idx,
				  u32 val)
{
	u32 ctrl = 0;
	int ret = 0;
	int retry = 0;

retry:
	writel(val, ioaddr + XGMAC_RSS_DATA);
	ctrl |= (idx & 0xFFF) << XGMAC_RSSIA_SHIFT;
	ctrl |= is_key ? XGMAC_ADDRT : 0x0;
	ctrl |= XGMAC_OB;
	writel(ctrl, ioaddr + XGMAC_RSS_ADDR);

	ret = readl_poll_timeout_atomic(ioaddr + XGMAC_RSS_ADDR, ctrl,
					!(ctrl & XGMAC_OB), 10, 10000);
	if (ret && !retry) {
		retry++;
		writel(0, ioaddr + XGMAC_RSS_ADDR);
		goto retry;
	}
	return ret;
}

/* Program the RSS key, indirection table and hash-type flags (PF only).
 * Non-SR-IOV: a disabled/absent config simply clears RSSE.  SR-IOV: RSSE is
 * kept on and "disable" is emulated by loading an all-zero indirection table
 * (so other functions keep their RSS).  @num_rxq is currently unused here.
 */
static int dwxgmac2_rss_configure(struct mac_device_info *hw,
				  struct dn200_rss *cfg, u32 num_rxq)
{
	void __iomem *ioaddr = hw->pcsr;
	u32 value, *key;
	int i, ret;
	u32 *table = NULL;
	u32 *tmp_table = NULL;

	if (HW_IS_VF(hw))
		return 0;

	value = readl(ioaddr + XGMAC_RSS_CTRL);
	if (!PRIV_SRIOV_SUPPORT(hw->priv)) {
		if (!cfg || !cfg->enable) {
			value &= ~XGMAC_RSSE;
			writel(value, ioaddr + XGMAC_RSS_CTRL);
			return 0;
		}
		tmp_table = cfg->table;
	} else {
		if (!cfg) {
			return 0;
		} else if (cfg && !cfg->enable) {
			/* zeroed table stands in for "RSS off" under SR-IOV */
			table = devm_kzalloc(hw->priv->device,
					     sizeof(u32) * DN200_RSS_MAX_TABLE_SIZE, GFP_ATOMIC);
			if (!table)
				return -ENOMEM;
			tmp_table = table;
		} else {
			tmp_table = cfg->table;
		}
	}
	key = (u32 *)cfg->key;
	for (i = 0; i < (ARRAY_SIZE(cfg->key) / sizeof(u32)); i++) {
		ret = dwxgmac2_rss_write_reg(ioaddr, true, i, key[i]);
		if (ret) {
			value = readl(ioaddr + XGMAC_RSS_ADDR);
			netdev_err(hw->priv->dev,
				   "%s: %d poll_timeout.RSS_ADDR's val = 0x%x\n",
				   __func__, __LINE__, value);
			goto free_table;
		}
	}

	for (i = 0; i < DN200_RSS_MAX_TABLE_SIZE; i++) {
		ret = dwxgmac2_rss_write_reg(ioaddr, false, i, tmp_table[i]);
		if (ret) {
			value = readl(ioaddr + XGMAC_RSS_ADDR);
			netdev_err(hw->priv->dev,
				   "%s: %d poll_timeout.RSS_ADDR's val = 0x%x\n",
				   __func__, __LINE__, value);
			goto free_table;
		}
	}

	if (cfg->rss_flags & DN200_RSS_IP2TE)
		value |= XGMAC_IP2TE;

	if (cfg->rss_flags & DN200_RSS_UDP4TE)
		value |= XGMAC_UDP4TE;

	if (cfg->rss_flags & DN200_RSS_TCP4TE)
		value |= XGMAC_TCP4TE;

	value |= XGMAC_RSSE;
	writel(value, ioaddr + XGMAC_RSS_CTRL);

free_table:
	if (table)
		devm_kfree(hw->priv->device, table);
	return ret;
}

/* Program VLAN filtering mode: hash filtering (@hash != 0), single perfect
 * match (@perfect_match), or fully disabled.  @is_double selects double-VLAN
 * (S-VLAN) handling.  VFs only write the hash table; the remaining registers
 * are PF-owned.
 */
static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash,
				      __le16 perfect_match, bool is_double)
{
	void __iomem *ioaddr = hw->pcsr;

	writel(hash, ioaddr + XGMAC_VLAN_HASH_TABLE);
	if (HW_IS_VF(hw))
		return;
	if (hash) {
		u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);

		value |= XGMAC_FILTER_VTFE;

		writel(value, ioaddr + XGMAC_PACKET_FILTER);

		value = readl(ioaddr + XGMAC_VLAN_TAG);

		value |= XGMAC_VLAN_VTHM | XGMAC_VLAN_ETV;
		if (is_double) {
			value |= XGMAC_VLAN_EDVLP;
			value |= XGMAC_VLAN_ESVL;
			value |= XGMAC_VLAN_DOVLTC;
		} else {
			value &= ~XGMAC_VLAN_EDVLP;
			value &= ~XGMAC_VLAN_ESVL;
			value &= ~XGMAC_VLAN_DOVLTC;
		}

		value &= ~XGMAC_VLAN_VID;
		writel(value, ioaddr + XGMAC_VLAN_TAG);
	} else if (perfect_match) {
		u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);

		value |= XGMAC_FILTER_VTFE;

		writel(value, ioaddr + XGMAC_PACKET_FILTER);

		value = readl(ioaddr + XGMAC_VLAN_TAG);

		value &= ~XGMAC_VLAN_VTHM;
		value |= XGMAC_VLAN_ETV;
		if (is_double) {
			value |= XGMAC_VLAN_EDVLP;
			value |= XGMAC_VLAN_ESVL;
			value |= XGMAC_VLAN_DOVLTC;
		} else {
			value &= ~XGMAC_VLAN_EDVLP;
			value &= ~XGMAC_VLAN_ESVL;
			value &= ~XGMAC_VLAN_DOVLTC;
		}

		value &= ~XGMAC_VLAN_VID;
		writel(value | perfect_match, ioaddr + XGMAC_VLAN_TAG);
	} else {
		u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);

		value &= ~XGMAC_FILTER_VTFE;

		writel(value, ioaddr + XGMAC_PACKET_FILTER);

		value = readl(ioaddr + XGMAC_VLAN_TAG);

		value &= ~(XGMAC_VLAN_VTHM | XGMAC_VLAN_ETV);
		value &= ~(XGMAC_VLAN_EDVLP | XGMAC_VLAN_ESVL);
		value &= ~XGMAC_VLAN_DOVLTC;
		value &= ~XGMAC_VLAN_VID;

		writel(value, ioaddr + XGMAC_VLAN_TAG);
	}
}

/* Descriptor for one hardware safety-error bit: validity, short name and a
 * human-readable explanation used when logging.
 */
struct dwxgmac3_error_desc {
	bool valid;
	const char *desc;
	const char *detailed_desc;
};

/* Byte offset of a counter within struct dn200_safety_stats. */
#define STAT_OFF(field) offsetof(struct dn200_safety_stats, field)

static void dwxgmac3_log_error(struct net_device *ndev, u32 value, bool corr,
const char *module_name, + const struct dwxgmac3_error_desc *desc, + unsigned long field_offset, + struct dn200_safety_stats *stats) +{ + unsigned long loc, mask; + u8 *bptr = (u8 *)stats; + unsigned long *ptr; + + ptr = (unsigned long *)(bptr + field_offset); + + mask = value; + for_each_set_bit(loc, &mask, 32) { + netdev_err(ndev, "Found %s error in %s: '%s: %s'\n", corr ? + "correctable" : "uncorrectable", module_name, + desc[loc].desc, desc[loc].detailed_desc); + + /* Update counters */ + ptr[loc]++; + } +} + +static const struct dwxgmac3_error_desc dwxgmac3_mac_errors[32] = { + { true, "ATPES", "Application Transmit Interface Parity Check Error" }, + { true, "DPES", "Descriptor Cache Data Path Parity Check Error" }, + { true, "TPES", "TSO Data Path Parity Check Error" }, + { true, "TSOPES", "TSO Header Data Path Parity Check Error" }, + { true, "MTPES", "MTL Data Path Parity Check Error" }, + { true, "MTSPES", "MTL TX Status Data Path Parity Check Error" }, + { true, "MTBUPES", "MAC TBU Data Path Parity Check Error" }, + { true, "MTFCPES", "MAC TFC Data Path Parity Check Error" }, + { true, "ARPES", + "Application Receive Interface Data Path Parity Check Error" }, + { true, "MRWCPES", "MTL RWC Data Path Parity Check Error" }, + { true, "MRRCPES", "MTL RCC Data Path Parity Check Error" }, + { true, "CWPES", "CSR Write Data Path Parity Check Error" }, + { true, "ASRPES", "AXI Slave Read Data Path Parity Check Error" }, + { true, "TTES", "TX FSM Timeout Error" }, + { true, "RTES", "RX FSM Timeout Error" }, + { true, "CTES", "CSR FSM Timeout Error" }, + { true, "ATES", "APP FSM Timeout Error" }, + { true, "PTES", "PTP FSM Timeout Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 18 */ + { false, "UNKNOWN", "Unknown Error" }, /* 19 */ + { false, "UNKNOWN", "Unknown Error" }, /* 20 */ + { true, "MSTTES", "Master Read/Write Timeout Error" }, + { true, "SLVTES", "Slave Read/Write Timeout Error" }, + { true, "ATITES", "Application Timeout on ATI Interface Error" }, 
+ { true, "ARITES", "Application Timeout on ARI Interface Error" }, + { true, "FSMPES", "FSM State Parity Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 26 */ + { false, "UNKNOWN", "Unknown Error" }, /* 27 */ + { false, "UNKNOWN", "Unknown Error" }, /* 28 */ + { false, "UNKNOWN", "Unknown Error" }, /* 29 */ + { false, "UNKNOWN", "Unknown Error" }, /* 30 */ + { true, "CPI", "Control Register Parity Check Error" }, +}; + +static void dwxgmac3_handle_mac_err(struct net_device *ndev, + void __iomem *ioaddr, bool correctable, + struct dn200_safety_stats *stats) +{ + u32 value; + + value = readl(ioaddr + XGMAC_MAC_DPP_FSM_INT_STATUS); + writel(value, ioaddr + XGMAC_MAC_DPP_FSM_INT_STATUS); + netdev_err(ndev, "dwxgmac come across mac err REG=%#x value=%#x\n", + XGMAC_MAC_DPP_FSM_INT_STATUS, value); + dwxgmac3_log_error(ndev, value, correctable, "MAC", dwxgmac3_mac_errors, + STAT_OFF(mac_errors), stats); +} + +static const struct dwxgmac3_error_desc dwxgmac3_mtl_errors[32] = { + { true, "TXCES", "MTL TX Memory Error" }, + { true, "TXAMS", "MTL TX Memory Address Mismatch Error" }, + { true, "TXUES", "MTL TX Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 3 */ + { true, "RXCES", "MTL RX Memory Error" }, + { true, "RXAMS", "MTL RX Memory Address Mismatch Error" }, + { true, "RXUES", "MTL RX Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 7 */ + { true, "ECES", "MTL EST Memory Error" }, + { true, "EAMS", "MTL EST Memory Address Mismatch Error" }, + { true, "EUES", "MTL EST Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 11 */ + { true, "RPCES", "MTL RX Parser Memory Error" }, + { true, "RPAMS", "MTL RX Parser Memory Address Mismatch Error" }, + { true, "RPUES", "MTL RX Parser Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 15 */ + { false, "UNKNOWN", "Unknown Error" }, /* 16 */ + { false, "UNKNOWN", "Unknown Error" }, /* 17 */ + { false, "UNKNOWN", "Unknown Error" }, /* 18 */ + { false, "UNKNOWN", "Unknown Error" }, /* 
19 */ + { false, "UNKNOWN", "Unknown Error" }, /* 20 */ + { false, "UNKNOWN", "Unknown Error" }, /* 21 */ + { false, "UNKNOWN", "Unknown Error" }, /* 22 */ + { false, "UNKNOWN", "Unknown Error" }, /* 23 */ + { false, "UNKNOWN", "Unknown Error" }, /* 24 */ + { false, "UNKNOWN", "Unknown Error" }, /* 25 */ + { false, "UNKNOWN", "Unknown Error" }, /* 26 */ + { false, "UNKNOWN", "Unknown Error" }, /* 27 */ + { false, "UNKNOWN", "Unknown Error" }, /* 28 */ + { false, "UNKNOWN", "Unknown Error" }, /* 29 */ + { false, "UNKNOWN", "Unknown Error" }, /* 30 */ + { false, "UNKNOWN", "Unknown Error" }, /* 31 */ +}; + +static void dwxgmac3_handle_mtl_err(struct net_device *ndev, + void __iomem *ioaddr, bool correctable, + struct dn200_safety_stats *stats) +{ + u32 value; + + value = readl(ioaddr + XGMAC_MTL_ECC_INT_STATUS); + writel(value, ioaddr + XGMAC_MTL_ECC_INT_STATUS); + netdev_err(ndev, "dwxgmac come across mtl err REG=%#x value=%#x\n", + XGMAC_MTL_ECC_INT_STATUS, value); + dwxgmac3_log_error(ndev, value, correctable, "MTL", dwxgmac3_mtl_errors, + STAT_OFF(mtl_errors), stats); +} + +static const struct dwxgmac3_error_desc dwxgmac3_dma_errors[32] = { + { true, "TCES", "DMA TSO Memory Error" }, + { true, "TAMS", "DMA TSO Memory Address Mismatch Error" }, + { true, "TUES", "DMA TSO Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 3 */ + { true, "DCES", "DMA DCACHE Memory Error" }, + { true, "DAMS", "DMA DCACHE Address Mismatch Error" }, + { true, "DUES", "DMA DCACHE Memory Error" }, + { false, "UNKNOWN", "Unknown Error" }, /* 7 */ + { false, "UNKNOWN", "Unknown Error" }, /* 8 */ + { false, "UNKNOWN", "Unknown Error" }, /* 9 */ + { false, "UNKNOWN", "Unknown Error" }, /* 10 */ + { false, "UNKNOWN", "Unknown Error" }, /* 11 */ + { false, "UNKNOWN", "Unknown Error" }, /* 12 */ + { false, "UNKNOWN", "Unknown Error" }, /* 13 */ + { false, "UNKNOWN", "Unknown Error" }, /* 14 */ + { false, "UNKNOWN", "Unknown Error" }, /* 15 */ + { false, "UNKNOWN", "Unknown Error" }, 
/* 16 */ + { false, "UNKNOWN", "Unknown Error" }, /* 17 */ + { false, "UNKNOWN", "Unknown Error" }, /* 18 */ + { false, "UNKNOWN", "Unknown Error" }, /* 19 */ + { false, "UNKNOWN", "Unknown Error" }, /* 20 */ + { false, "UNKNOWN", "Unknown Error" }, /* 21 */ + { false, "UNKNOWN", "Unknown Error" }, /* 22 */ + { false, "UNKNOWN", "Unknown Error" }, /* 23 */ + { false, "UNKNOWN", "Unknown Error" }, /* 24 */ + { false, "UNKNOWN", "Unknown Error" }, /* 25 */ + { false, "UNKNOWN", "Unknown Error" }, /* 26 */ + { false, "UNKNOWN", "Unknown Error" }, /* 27 */ + { false, "UNKNOWN", "Unknown Error" }, /* 28 */ + { false, "UNKNOWN", "Unknown Error" }, /* 29 */ + { false, "UNKNOWN", "Unknown Error" }, /* 30 */ + { false, "UNKNOWN", "Unknown Error" }, /* 31 */ +}; + +static void dwxgmac3_handle_dma_err(struct net_device *ndev, + void __iomem *ioaddr, bool correctable, + struct dn200_safety_stats *stats) +{ + u32 value; + + value = readl(ioaddr + XGMAC_DMA_ECC_INT_STATUS); + writel(value, ioaddr + XGMAC_DMA_ECC_INT_STATUS); + netdev_err(ndev, "dwxgmac come across dma err REG=%#x value=%#x\n", + XGMAC_DMA_ECC_INT_STATUS, value); + dwxgmac3_log_error(ndev, value, correctable, "DMA", dwxgmac3_dma_errors, + STAT_OFF(dma_errors), stats); +} + +static void dn200_feat_config_even_parity_check(void __iomem *ioaddr) +{ + u32 value; + /* 2. Change to even parity check */ + value = readl(ioaddr + XGMAC_MTL_DPP_CONTROL); + value &= 0xFFFFFFFD; + writel(value, ioaddr + XGMAC_MTL_DPP_CONTROL); +} + +static int +dwxgmac3_safety_feat_config(void __iomem *ioaddr, unsigned int asp, + struct dn200_safety_feature_cfg *safety_cfg, + struct mac_device_info *hw) +{ + u32 value; + + if (!asp) + return -EINVAL; + if (HW_IS_VF(hw)) + return 0; + /* 1. Enable Safety Features */ + writel(0x0, ioaddr + XGMAC_MTL_ECC_CONTROL); + + dn200_feat_config_even_parity_check(ioaddr); + + /* 2. 
Enable MTL Safety Interrupts */ + value = readl(ioaddr + XGMAC_MTL_ECC_INT_ENABLE); + value |= XGMAC_RPCEIE; /* RX Parser Memory Correctable Error */ + value |= XGMAC_ECEIE; /* EST Memory Correctable Error */ + value |= XGMAC_RXCEIE; /* RX Memory Correctable Error */ + value |= XGMAC_TXCEIE; /* TX Memory Correctable Error */ + writel(value, ioaddr + XGMAC_MTL_ECC_INT_ENABLE); + + /* 3. Enable DMA Safety Interrupts */ + value = readl(ioaddr + XGMAC_DMA_ECC_INT_ENABLE); + value |= XGMAC_DCEIE; /* Descriptor Cache Memory Correctable Error */ + value |= XGMAC_TCEIE; /* TSO Memory Correctable Error */ + writel(value, ioaddr + XGMAC_DMA_ECC_INT_ENABLE); + + /* Only ECC Protection for External Memory feature is selected */ + if (asp <= 0x1) + return 0; + + /* 4. Enable Parity and Timeout for FSM */ + value = readl(ioaddr + XGMAC_MAC_FSM_CONTROL); + value |= XGMAC_PRTYEN; /* FSM Parity Feature */ + value |= XGMAC_TMOUTEN; /* FSM Timeout Feature */ + writel(value, ioaddr + XGMAC_MAC_FSM_CONTROL); + + return 0; +} + +static int dwxgmac3_safety_feat_irq_status(struct net_device *ndev, + void __iomem *ioaddr, + unsigned int asp, + struct dn200_safety_stats *stats) +{ + bool err, corr; + u32 mtl, dma; + int ret = 0; + + if (!asp) + return -EINVAL; + + mtl = readl(ioaddr + XGMAC_MTL_SAFETY_INT_STATUS); + dma = readl(ioaddr + XGMAC_DMA_SAFETY_INT_STATUS); + + err = (mtl & XGMAC_MCSIS) || (dma & XGMAC_MCSIS); + corr = false; + if (err) { + dwxgmac3_handle_mac_err(ndev, ioaddr, corr, stats); + ret |= !corr; + } + + err = (mtl & (XGMAC_MEUIS | XGMAC_MECIS)) || + (dma & (XGMAC_MSUIS | XGMAC_MSCIS)); + corr = (mtl & XGMAC_MECIS) || (dma & XGMAC_MSCIS); + if (err) { + dwxgmac3_handle_mtl_err(ndev, ioaddr, corr, stats); + ret |= !corr; + } + + err = dma & (XGMAC_DEUIS | XGMAC_DECIS); + corr = dma & XGMAC_DECIS; + if (err) { + dwxgmac3_handle_dma_err(ndev, ioaddr, corr, stats); + ret |= !corr; + } + + return ret; +} + +static const struct dwxgmac3_error { + const struct 
dwxgmac3_error_desc *desc; +} dwxgmac3_all_errors[] = { + { dwxgmac3_mac_errors }, + { dwxgmac3_mtl_errors }, + { dwxgmac3_dma_errors }, +}; + +static int dwxgmac3_safety_feat_dump(struct dn200_safety_stats *stats, + int index, unsigned long *count, + const char **desc) +{ + int module = index / 32, offset = index % 32; + unsigned long *ptr = (unsigned long *)stats; + + if (module >= ARRAY_SIZE(dwxgmac3_all_errors)) + return -EINVAL; + if (!dwxgmac3_all_errors[module].desc[offset].valid) + return -EINVAL; + if (count) + *count = *(ptr + index); + if (desc) + *desc = dwxgmac3_all_errors[module].desc[offset].desc; + return 0; +} + +static int dwxgmac2_get_mac_tx_timestamp(struct mac_device_info *hw, u64 *ts) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + value = readl(ioaddr + XGMAC_TIMESTAMP_STATUS); + if (!(value & XGMAC_TXTSC)) + return -EBUSY; + *ts = readl(ioaddr + XGMAC_TXTIMESTAMP_NSEC) & XGMAC_TXTSSTSLO; + *ts += readl(ioaddr + XGMAC_TXTIMESTAMP_SEC) * 1000000000ULL; + return 0; +} + +static int dwxgmac2_flex_pps_config(void __iomem *ioaddr, int index, + struct dn200_pps_cfg *cfg, bool enable, + u32 sub_second_inc, u32 systime_flags) +{ + u32 tnsec = readl(ioaddr + XGMAC_PPSx_TARGET_TIME_NSEC(index)); + u32 val = readl(ioaddr + XGMAC_PPS_CONTROL); + u64 period; + + if (!cfg->available) + return -EINVAL; + if (tnsec & XGMAC_TRGTBUSY0) + return -EBUSY; + if (!sub_second_inc || !systime_flags) + return -EINVAL; + + val &= ~XGMAC_PPSx_MASK(index); + + if (!enable) { + val |= XGMAC_PPSCMDX(index, XGMAC_PPSCMD_STOP); + writel(val, ioaddr + XGMAC_PPS_CONTROL); + return 0; + } + + val |= XGMAC_PPSCMDX(index, XGMAC_PPSCMD_START); + val |= XGMAC_TRGTMODSELX(index, XGMAC_PPSCMD_START); + val |= XGMAC_PPSEN0; + + writel(cfg->start.tv_sec, ioaddr + XGMAC_PPSx_TARGET_TIME_SEC(index)); + + if (!(systime_flags & PTP_TCR_TSCTRLSSR)) + cfg->start.tv_nsec = (cfg->start.tv_nsec * 1000) / 465; + writel(cfg->start.tv_nsec, ioaddr + XGMAC_PPSx_TARGET_TIME_NSEC(index)); + + 
period = cfg->period.tv_sec * 1000000000; + period += cfg->period.tv_nsec; + + do_div(period, sub_second_inc); + + if (period <= 1) + return -EINVAL; + + writel(period - 1, ioaddr + XGMAC_PPSx_INTERVAL(index)); + + period >>= 1; + if (period <= 1) + return -EINVAL; + + writel(period - 1, ioaddr + XGMAC_PPSx_WIDTH(index)); + + /* Finally, activate it */ + writel(val, ioaddr + XGMAC_PPS_CONTROL); + return 0; +} + +static void dwxgmac2_sarc_configure(void __iomem *ioaddr, int val) +{ + u32 value = readl(ioaddr + XGMAC_TX_CONFIG); + + value &= ~XGMAC_CONFIG_SARC; + value |= val << XGMAC_CONFIG_SARC_SHIFT; + + writel(value, ioaddr + XGMAC_TX_CONFIG); +} + +static void dwxgmac2_enable_vlan(struct mac_device_info *hw, u32 type) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + value = readl(ioaddr + XGMAC_VLAN_INCL); + value |= XGMAC_VLAN_VLTI; + value &= ~XGMAC_VLAN_CSVL; /* Only use SVLAN */ + value &= ~XGMAC_VLAN_VLC; + value |= (type << XGMAC_VLAN_VLC_SHIFT) & XGMAC_VLAN_VLC; + writel(value, ioaddr + XGMAC_VLAN_INCL); +} + +static int dwxgmac2_filter_wait(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + return readl_poll_timeout_atomic(ioaddr + XGMAC_L3L4_ADDR_CTRL, value, + !(value & XGMAC_XB), 10, 10000); +} + +static int dwxgmac2_filter_read(struct mac_device_info *hw, u32 filter_no, + u8 reg, u32 *data) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret; + + ret = dwxgmac2_filter_wait(hw); + if (ret) + return ret; + + value = (((filter_no & 0x1f) << XGMAC_IDDR_FNUM) | reg) << XGMAC_IDDR_SHIFT; + value |= XGMAC_TT | XGMAC_XB; + writel(value, ioaddr + XGMAC_L3L4_ADDR_CTRL); + + ret = dwxgmac2_filter_wait(hw); + if (ret) + return ret; + + *data = readl(ioaddr + XGMAC_L3L4_DATA); + return 0; +} + +static int dwxgmac2_filter_write(struct mac_device_info *hw, u32 filter_no, + u8 reg, u32 data) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int retry = 0; + int ret; + +retry: + ret = dwxgmac2_filter_wait(hw); + if 
(ret) + return ret; + + writel(data, ioaddr + XGMAC_L3L4_DATA); + + value = (((filter_no & 0x1f) << XGMAC_IDDR_FNUM) | reg) << XGMAC_IDDR_SHIFT; + value |= XGMAC_XB; + writel(value, ioaddr + XGMAC_L3L4_ADDR_CTRL); + ret = dwxgmac2_filter_wait(hw); + if (ret && !retry) { + writel(0, ioaddr + XGMAC_L3L4_ADDR_CTRL); + retry++; + goto retry; + } else if (ret) { + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", __func__, + __LINE__); + } + return ret; +} + +static int dwxgmac2_config_l3_filter(struct mac_device_info *hw, u32 filter_no, + bool en, bool ipv6, bool sa, bool inv, + u32 match) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret; + + if (HW_IS_VF(hw)) + return 0; + if (!HW_IS_PUREPF(hw)) { + unsigned long bitmap_l3l4 = 0; + int offset = 0; + + offset = dn200_get_l3l4_filter_offset(hw, filter_no); + if (offset == -1) { + if (en) { + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + &bitmap_l3l4); + offset = + dn200_get_unused_bit(&bitmap_l3l4, 31, 0); + bitmap_set((unsigned long *)&bitmap_l3l4, + offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + bitmap_l3l4); + dn200_set_l3l4_filter_info(hw, filter_no, + offset, false); + filter_no = offset; + } else { + return -1; + } + } else { + filter_no = offset; + if (!en) { + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + &bitmap_l3l4); + bitmap_clear((unsigned long *)&bitmap_l3l4, + offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + bitmap_l3l4); + dn200_set_l3l4_filter_info(hw, filter_no, + offset, true); + } + } + } + + value = readl(ioaddr + XGMAC_PACKET_FILTER); + value |= XGMAC_FILTER_IPFE; + writel(value, ioaddr + XGMAC_PACKET_FILTER); + + ret = dwxgmac2_filter_read(hw, filter_no, XGMAC_L3L4_CTRL, &value); + if (ret) + return ret; + + /* For IPv6 not both SA/DA filters can be active */ + if (ipv6) { + value |= XGMAC_L3PEN0; + value &= ~(XGMAC_L3SAM0 | XGMAC_L3SAIM0); + value &= ~(XGMAC_L3DAM0 | XGMAC_L3DAIM0); + if (sa) { + value |= XGMAC_L3SAM0; + if (inv) + value |= 
XGMAC_L3SAIM0; + } else { + value |= XGMAC_L3DAM0; + if (inv) + value |= XGMAC_L3DAIM0; + } + } else { + value &= ~XGMAC_L3PEN0; + if (sa) { + value |= XGMAC_L3SAM0; + if (inv) + value |= XGMAC_L3SAIM0; + } else { + value |= XGMAC_L3DAM0; + if (inv) + value |= XGMAC_L3DAIM0; + } + } + + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, value); + if (ret) + return ret; + + if (sa) { + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR0, match); + if (ret) + return ret; + } else { + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR1, match); + if (ret) + return ret; + } + + if (!en) + return dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, 0); + return 0; +} + +static int dwxgmac2_config_l4_filter(struct mac_device_info *hw, u32 filter_no, + bool en, bool udp, bool sa, bool inv, + u32 match) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret; + + if (HW_IS_VF(hw)) + return 0; + if (!HW_IS_PUREPF(hw)) { + unsigned long bitmap_l3l4 = 0; + int offset = 0; + + offset = dn200_get_l3l4_filter_offset(hw, filter_no); + if (offset == -1) { + if (en) { + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + &bitmap_l3l4); + offset = + dn200_get_unused_bit(&bitmap_l3l4, 31, 0); + bitmap_set((unsigned long *)&bitmap_l3l4, + offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + bitmap_l3l4); + dn200_set_l3l4_filter_info(hw, filter_no, + offset, false); + filter_no = offset; + } else { + return -1; + } + } else { + filter_no = offset; + if (!en) { + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + &bitmap_l3l4); + bitmap_clear((unsigned long *)&bitmap_l3l4, + offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_l3l4, + bitmap_l3l4); + dn200_set_l3l4_filter_info(hw, filter_no, + offset, true); + } + } + } + + value = readl(ioaddr + XGMAC_PACKET_FILTER); + value |= XGMAC_FILTER_IPFE; + writel(value, ioaddr + XGMAC_PACKET_FILTER); + + ret = dwxgmac2_filter_read(hw, filter_no, XGMAC_L3L4_CTRL, &value); + if (ret) + return ret; + + if 
(udp) + value |= XGMAC_L4PEN0; + else + value &= ~XGMAC_L4PEN0; + + value &= ~(XGMAC_L4SPM0 | XGMAC_L4SPIM0); + value &= ~(XGMAC_L4DPM0 | XGMAC_L4DPIM0); + if (sa) { + value |= XGMAC_L4SPM0; + if (inv) + value |= XGMAC_L4SPIM0; + } else { + value |= XGMAC_L4DPM0; + if (inv) + value |= XGMAC_L4DPIM0; + } + + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, value); + if (ret) + return ret; + + if (sa) { + value = match & XGMAC_L4SP0; + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L4_ADDR, value); + if (ret) + return ret; + } else { + value = (match << XGMAC_L4DP0_SHIFT) & XGMAC_L4DP0; + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L4_ADDR, value); + if (ret) + return ret; + } + + if (!en) + return dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, 0); + + return 0; +} + +static int dwxgmac2_config_ntuple_filter(struct mac_device_info *hw, + u32 filter_no, + struct dn200_fdir_filter *input, + bool en) +{ + u32 value = 0; + int ret; + int mask; + u32 data = 0; + + if (HW_IS_VF(hw)) + return 0; + if (!en) + return dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, 0); + + if (!(input->flow_type & (DN200_FLOW_TYPE_SA | DN200_FLOW_TYPE_DA | + DN200_FLOW_TYPE_SPORT | DN200_FLOW_TYPE_DPORT))) + return -1; + if (input->action & DN200_FLOW_ACTION_ROUTE) { + value |= XGMAC_DMCHEN; + value |= ((input->queue << XGMAC_DMCHN_SHIFT) & XGMAC_DMCHN); + } + /* For IPv6 not both SA/DA filters can be active */ + if (input->flow_type & DN200_FLOW_TYPE_V6) { + value |= XGMAC_L3PEN0; + value &= ~(XGMAC_L3SAM0 | XGMAC_L3SAIM0); + value &= ~(XGMAC_L3DAM0 | XGMAC_L3DAIM0); + if (input->flow_type & DN200_FLOW_TYPE_SA) + value |= XGMAC_L3SAM0; + else if (input->flow_type & DN200_FLOW_TYPE_DA) + value |= XGMAC_L3DAM0; + else + goto set_port; + mask = input->xgmac_mask_src; + value |= ((mask << XGMAC_L3HSBM0_SHIFT) & XGMAC_L3HSBM0_V6); + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR0, + ntohl(input->ip6[3])); + if (ret) + return ret; + ret = + 
dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR1, + ntohl(input->ip6[2])); + if (ret) + return ret; + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR2, + ntohl(input->ip6[1])); + if (ret) + return ret; + ret = + dwxgmac2_filter_write(hw, filter_no, XMGAC_L3_ADDR3, + ntohl(input->ip6[0])); + if (ret) + return ret; + } + if (input->flow_type & DN200_FLOW_TYPE_V4) { + value &= ~XGMAC_L3PEN0; + if (input->flow_type & DN200_FLOW_TYPE_SA) { + value |= XGMAC_L3SAM0; + /*get src ip mask */ + mask = (input->xgmac_mask_src); + value |= + ((mask << XGMAC_L3HSBM0_SHIFT) & XGMAC_L3HSBM0); + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR0, + ntohl(input->src_ip)); + if (ret) + return ret; + } + if (input->flow_type & DN200_FLOW_TYPE_DA) { + value |= XGMAC_L3DAM0; + /*get dst ip mask */ + mask = (input->xgmac_mask_dst); + value |= + ((mask << XGMAC_L3HDBM0_SHIFT) & XGMAC_L3HDBM0); + ret = + dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR1, + ntohl(input->dst_ip)); + if (ret) + return ret; + } + } + +set_port: + if (input->flow_type & DN200_FLOW_TYPE_UDP) + value |= XGMAC_L4PEN0; + else + value &= ~XGMAC_L4PEN0; + + value &= ~(XGMAC_L4SPM0 | XGMAC_L4SPIM0); + value &= ~(XGMAC_L4DPM0 | XGMAC_L4DPIM0); + if (input->flow_type & DN200_FLOW_TYPE_SPORT) { + value |= XGMAC_L4SPM0; + data |= (ntohs(input->src_port) & XGMAC_L4SP0); + } + + if (input->flow_type & DN200_FLOW_TYPE_DPORT) { + value |= XGMAC_L4DPM0; + data |= + (ntohs(input->dst_port) << XGMAC_L4DP0_SHIFT) & XGMAC_L4DP0; + } + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L4_ADDR, data); + if (ret) + return ret; + + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, value); + if (ret) + return ret; + + return 0; +} + +static void dwxgmac2_config_l3l4_filter(struct mac_device_info *hw, bool en) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_PACKET_FILTER); + + if (en) { + value |= (2 << XGMAC_DHLFRS_SHIFT); + value |= 
XGMAC_FILTER_IPFE; + } else { + value &= ~(XGMAC_DHLFRS_MASK); + value &= ~XGMAC_FILTER_IPFE; + } + writel(value, ioaddr + XGMAC_PACKET_FILTER); + if (en) { + /*enable ipv4 l3 filter all pass */ + dwxgmac2_filter_write(hw, 30, XGMAC_L3L4_CTRL, + XGMAC_L3SAM0 | XGMAC_L3HSBM0); + dwxgmac2_filter_write(hw, 30, XGMAC_L3_ADDR0, (0)); + dwxgmac2_filter_write(hw, 31, XGMAC_L3L4_CTRL, + XGMAC_L3SAM0 | XGMAC_L3HSBM0); + dwxgmac2_filter_write(hw, 31, XGMAC_L3_ADDR0, (0x80000000)); + /*enable ipv6 l3 filter all pass */ + dwxgmac2_filter_write(hw, 30, XGMAC_L3L4_CTRL, + XGMAC_L3SAM0 | XGMAC_L3HSBM0_V6 | + XGMAC_L3PEN0); + dwxgmac2_filter_write(hw, 30, XMGAC_L3_ADDR3, (0)); + dwxgmac2_filter_write(hw, 31, XGMAC_L3L4_CTRL, + XGMAC_L3SAM0 | XGMAC_L3HSBM0_V6 | + XGMAC_L3PEN0); + dwxgmac2_filter_write(hw, 31, XMGAC_L3_ADDR3, (0x80000000)); + } else { + /*disable ipv4 l3 filter all pass entry */ + dwxgmac2_filter_write(hw, 30, 0, 0); + dwxgmac2_filter_write(hw, 31, 0, 0); + /*disable ipv6 l3 filter all pass entry */ + dwxgmac2_filter_write(hw, 29, 0, 0); + dwxgmac2_filter_write(hw, 28, 0, 0); + } +} + +static void dwxgmac2_set_arp_offload(struct mac_device_info *hw, bool en, + u32 addr) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + writel(addr, ioaddr + XGMAC_ARP_ADDR); + + value = readl(ioaddr + XGMAC_RX_CONFIG); + if (en) + value |= XGMAC_CONFIG_ARPEN; + else + value &= ~XGMAC_CONFIG_ARPEN; + if (HW_IS_VF(hw)) + return; + writel(value, ioaddr + XGMAC_RX_CONFIG); +} + +static int dwxgmac3_est_write(void __iomem *ioaddr, u32 reg, u32 val, bool gcl) +{ + u32 ctrl; + + writel(val, ioaddr + XGMAC_MTL_EST_GCL_DATA); + + ctrl = (reg << XGMAC_ADDR_SHIFT); + ctrl |= gcl ? 
0 : XGMAC_GCRR; + + writel(ctrl, ioaddr + XGMAC_MTL_EST_GCL_CONTROL); + + ctrl |= XGMAC_SRWO; + writel(ctrl, ioaddr + XGMAC_MTL_EST_GCL_CONTROL); + + return readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_EST_GCL_CONTROL, + ctrl, !(ctrl & XGMAC_SRWO), 100, + 50000); +} + +static int dwxgmac3_est_configure(void __iomem *ioaddr, struct dn200_est *cfg, + unsigned int ptp_rate) +{ + int i, ret = 0x0; + u32 ctrl; + + ret |= dwxgmac3_est_write(ioaddr, XGMAC_BTR_LOW, cfg->btr[0], false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_BTR_HIGH, cfg->btr[1], false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_TER, cfg->ter, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_LLR, cfg->gcl_size, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_CTR_LOW, cfg->ctr[0], false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_CTR_HIGH, cfg->ctr[1], false); + if (ret) + return ret; + + for (i = 0; i < cfg->gcl_size; i++) { + ret = dwxgmac3_est_write(ioaddr, i, cfg->gcl[i], true); + if (ret) + return ret; + } + + ctrl = readl(ioaddr + XGMAC_MTL_EST_CONTROL); + ctrl &= ~XGMAC_PTOV; + ctrl |= ((1000000000 / ptp_rate) * 9) << XGMAC_PTOV_SHIFT; + if (cfg->enable) + ctrl |= XGMAC_EEST | XGMAC_SSWL; + else + ctrl &= ~XGMAC_EEST; + + writel(ctrl, ioaddr + XGMAC_MTL_EST_CONTROL); + return 0; +} + +static void dwxgmac3_fpe_configure(void __iomem *ioaddr, u32 num_txq, + u32 num_rxq, bool enable) +{ + u32 value; + + if (!enable) { + value = readl(ioaddr + XGMAC_FPE_CTRL_STS); + + value &= ~XGMAC_EFPE; + + writel(value, ioaddr + XGMAC_FPE_CTRL_STS); + return; + } + + value = readl(ioaddr + XGMAC_RXQ_CTRL1); + value &= ~XGMAC_RQ; + value |= (num_rxq - 1) << XGMAC_RQ_SHIFT; + writel(value, ioaddr + XGMAC_RXQ_CTRL1); + + value = readl(ioaddr + XGMAC_FPE_CTRL_STS); + value |= XGMAC_EFPE; + writel(value, ioaddr + XGMAC_FPE_CTRL_STS); +} + +static int dwxgmac2_indiraccess_wait(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + return readl_poll_timeout_atomic(ioaddr + 
XGMAC_INDIR_ACCESS_CTRL, + value, !(value & XGMAC_INDIR_OB), 10, + 10000); +} + +static int __maybe_unused dwxgmac2_indiraccess_read(struct mac_device_info *hw, + u32 addr_off, u8 mode_sel, + u32 *data) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret; + + ret = dwxgmac2_indiraccess_wait(hw); + if (ret) + return ret; + + value = + ((addr_off & 0xfff) << XGMAC_INDIR_AOFF_SHIFT) | (mode_sel << + XGMAC_INDIR_MSEL_SHIFT); + value |= XGMAC_INDIR_COM | XGMAC_INDIR_OB; + writel(value, ioaddr + XGMAC_INDIR_ACCESS_CTRL); + + ret = dwxgmac2_indiraccess_wait(hw); + if (ret) + return ret; + + *data = readl(ioaddr + XGMAC_INDRI_ACCESS_DATA); + return 0; +} + +static int dwxgmac2_indiraccess_write(struct mac_device_info *hw, u32 addr_off, + u8 mode_sel, u32 data) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + int ret; + int retry = 0; + + ret = dwxgmac2_indiraccess_wait(hw); + if (ret) + return ret; +retry: + writel(data, ioaddr + XGMAC_INDRI_ACCESS_DATA); + + value = + ((addr_off & 0xfff) << XGMAC_INDIR_AOFF_SHIFT) | (mode_sel << + XGMAC_INDIR_MSEL_SHIFT); + value |= XGMAC_INDIR_OB; + writel(value, ioaddr + XGMAC_INDIR_ACCESS_CTRL); + + ret = dwxgmac2_indiraccess_wait(hw); + if (ret && !retry) { + writel(0, ioaddr + XGMAC_INDIR_ACCESS_CTRL); + retry++; + goto retry; + } else if (ret) { + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", __func__, + __LINE__); + return -EBUSY; + } + return ret; +} + +static void dwxgmac2_hw_vlan_init(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value, i; + int ret; + int rovlt_init = 0; + + if (!HW_IS_PUREPF(hw)) + rovlt_init = 1; + writel(GENMASK(11, 0), ioaddr + XGMAC_RVLAN_LKP_SIZE); + for (i = 0; i < VLAN_N_VID - 1; i++) { + ret = + dwxgmac2_indiraccess_write(hw, i, XGMAC_INDIR_EXT_ROVTL, + rovlt_init); + if (ret) + return; + } + writel(GENMASK(11, 0), ioaddr + XGMAC_RVLAN_LKP_SIZE); + + value = readl(ioaddr + XGMAC_VLAN_TAG); + value &= ~XGMAC_VLAN_VTHM; + value &= ~XGMAC_VLAN_ETV; + /* Enable 
external Receive Outer VLAN Tag Lookup based perfect filtering and routing */ + value &= ~XGMAC_VLAN_EROVTL_MASK; + value |= (0x3 << XGMAC_VLAN_EROVTL_SHIFT); + value |= XGMAC_VLAN_EDVLP; + value |= XGMAC_VLAN_DOVLTC; + + writel(value, ioaddr + XGMAC_VLAN_TAG); +} + +static int dwxgmac2_add_hw_vlan_rx_fltr(struct net_device *dev, + struct mac_device_info *hw, + __be16 proto, u16 vid, uint8_t off, + bool is_last) +{ + /* enable external lookup perfect fileter */ + return dwxgmac2_indiraccess_write(hw, vid, XGMAC_INDIR_EXT_ROVTL, 1); +} + +static int dwxgmac2_del_hw_vlan_rx_fltr(struct net_device *dev, + struct mac_device_info *hw, + __be16 proto, u16 vid, uint8_t off, + bool is_last) +{ + int ret; + + ret = dwxgmac2_indiraccess_write(hw, vid & 0xFFF, XGMAC_INDIR_EXT_ROVTL, + 0); + return ret; +} + +static void dwxgmac2_config_vlan_rx_fltr(struct mac_device_info *hw, + bool enable) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value = readl(ioaddr + XGMAC_PACKET_FILTER); + + if (enable) + value |= XGMAC_FILTER_VTFE; + else + value &= ~XGMAC_FILTER_VTFE; + writel(value, ioaddr + XGMAC_PACKET_FILTER); +} + +static void dwxgmac2_rx_vlan_stripping_config(struct mac_device_info *hw, + bool enable) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_VLAN_TAG); + + if (enable) { + value |= (XGMAC_VLAN_TAG_CTRL_EVLRXS); + value &= ~(XGMAC_VLAN_TAG_CTRL_EVLS_MASK); + value |= XGMAC_VLAN_TAG_STRIP_PASS; + } else { + value &= ~XGMAC_VLAN_TAG_CTRL_EVLS_MASK; + value |= XGMAC_VLAN_TAG_STRIP_NONE; + } + writel(value, ioaddr + XGMAC_VLAN_TAG); +} + +#define RSS_KEY_SIZE 10 +#define RSS_TABLE_SIZE 256 +#define EXT_DAHASH_SIZE 256 +#define GCL_MEM_SIZE 256 +static int dwxgmac2_rxf_and_acl_mem_reset(struct mac_device_info *hw) +{ + void __iomem *ioaddr = hw->pcsr; + int filter_no = 0, i = 0; + int ret = 0; + + /*rss reset */ + writel(0, ioaddr + XGMAC_RSS_DATA); + writel(0, ioaddr + XGMAC_RSS_ADDR); + for (i = 0; i < 
RSS_KEY_SIZE; i++) { + ret = dwxgmac2_rss_write_reg(ioaddr, true, i, 0); + if (ret) { + netdev_err(hw->priv->dev, "%s: %d i %d poll_timeout.\n", + __func__, __LINE__, i); + return ret; + } + } + + for (i = 0; i < RSS_TABLE_SIZE; i++) { + ret = dwxgmac2_rss_write_reg(ioaddr, false, i, 0); + if (ret) { + netdev_err(hw->priv->dev, "%s: %d i %d poll_timeout.\n", + __func__, __LINE__, i); + return ret; + } + } + + /*filter */ + /*clear operation busy */ + writel(0, ioaddr + XGMAC_L3L4_ADDR_CTRL); + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3L4_CTRL, 0); + if (ret) + return ret; + } + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L4_ADDR, 0); + if (ret) + return ret; + } + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR0, 0); + if (ret) + return ret; + } + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR1, 0); + if (ret) + return ret; + } + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XGMAC_L3_ADDR2, 0); + if (ret) + return ret; + } + for (filter_no = 0; filter_no < hw->priv->dma_cap.l3l4fnum; filter_no++) { + ret = dwxgmac2_filter_write(hw, filter_no, XMGAC_L3_ADDR3, 0); + if (ret) + return ret; + } + + /**/ writel(0, ioaddr + XGMAC_RVLAN_LKP_SIZE); + for (i = 0; i < VLAN_N_VID; i++) { + ret = + dwxgmac2_indiraccess_write(hw, i, XGMAC_INDIR_EXT_ROVTL, 0); + if (ret) + return ret; + } + + for (i = 0; i < VLAN_N_VID; i++) { + ret = + dwxgmac2_indiraccess_write(hw, i, XGMAC_INDIR_EXT_RIVTL, 0); + if (ret) + return ret; + } + + for (i = 0; i < 0x1f; i++) { + ret = + dwxgmac2_indiraccess_write(hw, i, XGMAC_INDIR_DCHSEL, 0); + if (ret) + return ret; + } + /*gcl */ + ret |= 
dwxgmac3_est_write(ioaddr, XGMAC_BTR_LOW, 0, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_BTR_HIGH, 0, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_TER, 0, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_LLR, GCL_MEM_SIZE, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_CTR_LOW, 0, false); + ret |= dwxgmac3_est_write(ioaddr, XGMAC_CTR_HIGH, 0, false); + if (ret) + return ret; + + for (i = 0; i < GCL_MEM_SIZE; i++) { + ret = dwxgmac3_est_write(ioaddr, i, 0, true); + if (ret) + return ret; + } + return 0; +} + +static int dwxgmac3_rxp_disable(void __iomem *ioaddr) +{ + u32 val = readl(ioaddr + XGMAC_MTL_OPMODE); + + val &= ~XGMAC_FRPE; + writel(val, ioaddr + XGMAC_MTL_OPMODE); + + return 0; +} + +static void dwxgmac3_rxp_enable(void __iomem *ioaddr) +{ + u32 val; + + val = readl(ioaddr + XGMAC_MTL_OPMODE); + val |= XGMAC_FRPE; + writel(val, ioaddr + XGMAC_MTL_OPMODE); +} + +static int dwxgmac3_rxp_update_single_entry(struct mac_device_info *hw, + struct dn200_tc_entry *entry, + int pos) +{ + int ret, i; + void __iomem *ioaddr = hw->pcsr; + int retry = 0; + u32 val; + int real_pos; + + for (i = 0; i < (sizeof(entry->val) / sizeof(u32)); i++) { + real_pos = pos * (sizeof(entry->val) / sizeof(u32)) + i; + retry = 0; + /* Wait for ready */ + ret = + readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST, val, + !(val & XGMAC_STARTBUSY), 1, + 10000); + if (ret) { + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", + __func__, __LINE__); + return ret; + } +retry: + /* Write data */ + val = *((u32 *)&entry->val + i); + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_DATA); + + /* Write pos */ + val = real_pos & XGMAC_ADDR; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Write OP */ + val |= XGMAC_WRRDN; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Start Write */ + val |= XGMAC_STARTBUSY; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Wait for done */ + ret = + readl_poll_timeout_atomic(ioaddr + + XGMAC_MTL_RXP_IACC_CTRL_ST, 
val, + !(val & XGMAC_STARTBUSY), 1, + 10000); + if (ret && !retry) { + retry = 1; + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_DATA); + goto retry; + } else if (ret) { + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", + __func__, __LINE__); + return ret; + } + } + return 0; +} + +static int dwxgmac2_rxp_get_single_entry_sriov(struct mac_device_info *hw, + u32 *data, int real_pos) +{ + int ret; + u32 val; + int retry = 0; + void __iomem *ioaddr = hw->pcsr; + + /* Wait for ready */ + ret = readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST, + val, !(val & XGMAC_STARTBUSY), 10, + 10000); + if (ret) { + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", __func__, + __LINE__); + return ret; + } +retry: + /* Write pos */ + val = real_pos & XGMAC_ADDR; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Write OP */ + val &= ~XGMAC_WRRDN; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Start Write */ + val |= XGMAC_STARTBUSY; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Wait for done */ + ret = readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST, + val, !(val & XGMAC_STARTBUSY), 10, + 100000); + if (ret && !ret) { + retry = 1; + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + goto retry; + } else if (ret) { + WARN_ON(1); + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", __func__, + __LINE__); + return ret; + } + + /* Read data */ + *data = readl(ioaddr + XGMAC_MTL_RXP_IACC_DATA); + + return 0; +} + +static int dwxgmac2_rxp_update_single_entry_sriov(struct mac_device_info *hw, + u32 data, int real_pos) +{ + int ret; + u32 val; + void __iomem *ioaddr = hw->pcsr; + int retry = 0; + + /* Wait for ready */ + ret = readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST, + val, !(val & XGMAC_STARTBUSY), 10, + 10000); + if (ret) { + netdev_err(hw->priv->dev, "%s: %d 
poll_timeout\n", __func__, + __LINE__); + return ret; + } + +retry: + /* Write data */ + writel(data, ioaddr + XGMAC_MTL_RXP_IACC_DATA); + + /* Write pos */ + val = real_pos & XGMAC_ADDR; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Write OP */ + val |= XGMAC_WRRDN; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Start Write */ + val |= XGMAC_STARTBUSY; + writel(val, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + + /* Wait for done */ + ret = readl_poll_timeout_atomic(ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST, + val, !(val & XGMAC_STARTBUSY), 10, + 100000); + if (ret && !retry) { + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + retry++; + goto retry; + } else if (ret) { + writel(0, ioaddr + XGMAC_MTL_RXP_IACC_CTRL_ST); + netdev_err(hw->priv->dev, "%s: %d poll_timeout\n", __func__, + __LINE__); + } + + return ret; +} + +static void dwxgmac2_rxp_clear_entry_sriov(struct mac_device_info *hw) +{ + int i = 0; + + for (; i < DN200_MAX_USED_RXP_NUM; i++) { + dwxgmac2_rxp_update_single_entry_sriov(hw, 0, i * 4 + 0); + dwxgmac2_rxp_update_single_entry_sriov(hw, 0, i * 4 + 1); + dwxgmac2_rxp_update_single_entry_sriov(hw, 0, i * 4 + 2); + dwxgmac2_rxp_update_single_entry_sriov(hw, 0, i * 4 + 3); + } +} + +static int dwxgmac2_rxp_update_single_da_entry_sriov(struct mac_device_info *hw, + struct RXP_FPR_ENTRY + *entry, int pos) +{ + int ret, i; + u32 val; + int real_pos; + + /*delete rxp channel, clear dma channel firstly */ + if (entry->dma_ch_no == 0) { + for (i = (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) - 1; + i >= 0; i--) { + real_pos = + pos * (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + i; + + val = *((u32 *)entry + i); + ret = + dwxgmac2_rxp_update_single_entry_sriov(hw, val, + real_pos); + if (ret) + return ret; + } + } else { + for (i = 0; i < (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)); + i++) { + real_pos = + pos * (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + + i; + val = *((u32 *)entry + i); + ret = + 
dwxgmac2_rxp_update_single_entry_sriov(hw, val, + real_pos); + if (ret) + return ret; + } + } + return 0; +} + +static int dwxgmac2_rxp_get_single_da_entry_sriov(struct mac_device_info *hw, + struct RXP_FPR_ENTRY *entry, + int pos) +{ + int ret, i; + u32 val; + int real_pos; + + for (i = 0; i < (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)); i++) { + real_pos = + pos * (sizeof(struct RXP_FPR_ENTRY) / sizeof(u32)) + i; + ret = dwxgmac2_rxp_get_single_entry_sriov(hw, &val, real_pos); + if (ret) + return ret; + + *((u32 *)entry + i) = val; + } + + return 0; +} + +static int rxp_offset_get_from_bitmap(int bitmap_off) +{ + int rxp_off = 0; + + if (bitmap_off < 1) + rxp_off = 0; + else if (bitmap_off < DN200_VF_UC_OFF) + rxp_off = (bitmap_off) * 2; + else + rxp_off = + (bitmap_off - DN200_VF_UC_OFF) * 2 + 2 + + DN200_PF_VLAN_ENTRY_NUM + 1 + (DN200_PF_SELF_UC_NUM + + DN200_PF_OTHER_UC_NUM) * 2; + return rxp_off; +} + +static int dn200_add_vf_uc_rxp_da_route_sriov(struct mac_device_info *hw, + u8 *mac_addr, int offset, u8 rxq_start) +{ + unsigned long bitmap; + int prev_off; + int next_off; + u32 prev_off_val; + u32 data; + u32 channel = 0; + u8 entry_offset = 0; + struct RXP_FPR_ENTRY *frp_entry_data; + + frp_entry_data = kcalloc(3, sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return -ENOMEM; + + channel = (1 << rxq_start); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + entry_offset = rxp_offset_get_from_bitmap(offset); + + /*get prev used entry */ + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + if (prev_off < 0) + prev_off = 0; + + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + + if (prev_off && prev_off < DN200_VF_UC_OFF) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + + next_off = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + + next_off = (data & 
OK_INDEX_MASK) >> OK_INDEX_OFFSET; + } + frp_entry_data[0].match_data = cpu_to_le32(*(u32 *)(mac_addr)); + frp_entry_data[0].match_en = 0xffffffff; + frp_entry_data[0].nc = 1; + frp_entry_data[0].ok_index = next_off; + frp_entry_data[0].frame_offset = 0; + + frp_entry_data[1].match_data = cpu_to_le32(*(u16 *)(mac_addr + 4)); + frp_entry_data[1].match_en = 0xffff; + frp_entry_data[1].af = 1; + frp_entry_data[1].nc = 1; + frp_entry_data[1].frame_offset = 1; + frp_entry_data[1].ok_index = next_off; + frp_entry_data[1].dma_ch_no = channel; + + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], + entry_offset + 1); + + if (prev_off && prev_off < DN200_VF_UC_OFF) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + } + bitmap_set((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); + kfree(frp_entry_data); + return offset; +} + +/* uc mac addr add*/ +static int dn200_add_pf_uc_rxp_da_route_sriov(struct mac_device_info *hw, + u8 *mac_addr, int offset) +{ + unsigned long bitmap; + int prev_off; + int next_off; + u32 prev_off_val; + u32 data; 
+ u32 channel = 0; + u8 entry_offset = 0; + int next_uc_offset = 0; + struct RXP_FPR_ENTRY *frp_entry_data; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return -EINVAL; + } + + frp_entry_data = kcalloc(3, sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return -ENOMEM; + + channel = (1 << DN200_RXQ_START_GET(hw)); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + if (!offset) { + offset = + dn200_get_unused_bit(&bitmap, DN200_PF_UC_ADDR_END, + DN200_PF_UC_ADDR_START); + if (offset < 0) { + netdev_dbg(hw->priv->dev, "%s:uc unused bit smaller zero\n", __func__); + return -EBUSY; + } + + /*Skip this option if it has already been configured */ + if (bitmap & (1 << offset)) { + netdev_dbg(hw->priv->dev, "%s: uc has already been configured\n", __func__); + kfree(frp_entry_data); + return offset; + } + } + + /*get prev used entry */ + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + if (prev_off < 0) + prev_off = 0; + + entry_offset = rxp_offset_get_from_bitmap(offset); + + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + + /* from the pre entry,we can get the entry that pre entry shot */ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + next_off = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + frp_entry_data[0].match_data = cpu_to_le32(*(u32 *)(mac_addr)); + frp_entry_data[0].match_en = 0xffffffff; + frp_entry_data[0].nc = 1; + frp_entry_data[0].ok_index = next_off; + frp_entry_data[0].frame_offset = 0; + + frp_entry_data[1].match_data = cpu_to_le32(*(u16 *)(mac_addr + 4)); + frp_entry_data[1].match_en = 0xffff; + frp_entry_data[1].af = 1; + frp_entry_data[1].rf = 1; + frp_entry_data[1].nc = 0; + frp_entry_data[1].frame_offset = 1; + frp_entry_data[1].ok_index = (DN200_VLAN_ADDR_START); + frp_entry_data[1].dma_ch_no = channel; + + /*when match ,need route the vlan entry */ + next_uc_offset = 
dn200_get_next_used_bit(&bitmap, offset, DN200_VF_UC_OFF); + + /*last uc*/ + if (next_uc_offset < 0) { + if (prev_off < 1) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET(prev_off)); + frp_entry_data[2].match_data = 0; + frp_entry_data[2].match_en = 0; + frp_entry_data[2].af = 0; + frp_entry_data[2].nc = 0; + frp_entry_data[2].ok_index = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + frp_entry_data[2].dma_ch_no = channel; + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data + [2], + entry_offset + + 2); + } else { + dwxgmac2_rxp_get_single_da_entry_sriov(hw, + &frp_entry_data[2], + prev_off_val + 2); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[2], + entry_offset + 2); + } + } + + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], + entry_offset + 1); + + if (next_uc_offset < 0) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (entry_offset)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset + 2) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (entry_offset)); + } + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + if (!prev_off_val) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + } + + bitmap_set((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); + kfree(frp_entry_data); + return offset; +} + +/*from lram 
,we can get pf's ohter uc mac addr(not contain self) */ +static int dn200_pf_lram_uc_add_rxp(struct mac_device_info *hw, + u8 *mac_addr) +{ + unsigned long bitmap_uc; + unsigned long bitmap_pm; + int j = 0, found = 0, unused_bit = 0; + struct mac_addr_route *addr_route = + kmalloc(sizeof(struct mac_addr_route), GFP_ATOMIC); + + if (!addr_route) + return -ENOMEM; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_uc, &bitmap_uc); + for (j = 0; j < DN200_MAX_UC_MAC_ADDR_NUM; j++) { + if (!(bitmap_uc & (1 << j))) + continue; + + dn200_get_func_uc_mac_addr(hw, j, addr_route); + if (memcmp(addr_route->mac_addr, mac_addr, ETH_ALEN) == 0) { + found = 1; + break; + } + } + /*if found, we do nothing */ + if (!found) { + addr_route->rxp_offset = + dn200_add_pf_uc_rxp_da_route_sriov(hw, mac_addr, 0); + if (addr_route->rxp_offset <= 0) { + kfree(addr_route); + return 0; + } + addr_route->channel = (1 << DN200_RXQ_START_GET(hw)); + memcpy(addr_route->mac_addr, mac_addr, ETH_ALEN); + /*supoort 16 pf uc mac addr ,but first can not change here */ + unused_bit = + dn200_get_unused_bit(&bitmap_uc, DN200_MAX_UC_MAC_ADDR_NUM, + 0); + if (unused_bit < 0) { /*if full, we should set promisc */ + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, + &bitmap_pm); + bitmap_pm |= DN200_RXQ_START_GET(hw); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, + bitmap_pm); + kfree(addr_route); + return 0; + } + dn200_set_func_uc_mac_addr(hw, unused_bit, addr_route); + bitmap_set((unsigned long *)&bitmap_uc, unused_bit, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_uc, bitmap_uc); + } + + kfree(addr_route); + return 0; +} + +static int dn200_del_vf_uc_rxp_da_route(struct mac_device_info *hw, + int offset) +{ + int prev_off; + int next_off; + u32 data; + u32 prev_off_val; + u8 entry_offset; + unsigned long bitmap; + struct RXP_FPR_ENTRY *frp_entry_data; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return -EINVAL; + } + + frp_entry_data = + 
kcalloc(3, sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return -ENOMEM; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + entry_offset = rxp_offset_get_from_bitmap(offset); + + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET(entry_offset)); + if (!data) + goto free_mem; + + next_off = ((data & OK_INDEX_MASK) >> OK_INDEX_OFFSET); + + if (!next_off) + goto free_mem; + + if (prev_off && prev_off < DN200_VF_UC_OFF) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + } + + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], + entry_offset + 1); + + bitmap_clear((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); +free_mem: + kfree(frp_entry_data); + return 0; +} + +/*rxp uc delete and laram delete*/ +static int dn200_del_pf_uc_rxp_da_route(struct mac_device_info *hw, + int offset) +{ + int prev_off; + int next_off; + u32 data; + u32 prev_off_val; + int entry_offset; + unsigned long 
bitmap; + u8 tmp_off; + struct RXP_FPR_ENTRY *frp_entry_data; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return -EINVAL; + } + + frp_entry_data = kcalloc(3, sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return -ENOMEM; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + entry_offset = rxp_offset_get_from_bitmap(offset); + + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + + next_off = dn200_get_next_used_bit(&bitmap, offset, DN200_VF_UC_OFF); + if (next_off < 0) { /*uc last */ + dwxgmac2_rxp_get_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset + 2); + + /*delete 3's entry rxp*/ + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[1], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[1], + entry_offset + 1); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[1], + entry_offset + 2); + /*first uc*/ + if (!prev_off) { + /*prev first ok_index*/ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET(prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (frp_entry_data[0].ok_index << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET(prev_off_val)); + /*prev second ok_index*/ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET(prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (frp_entry_data[0].ok_index << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET(prev_off_val)); + } else { + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[0], + entry_offset); + } + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (entry_offset)); + tmp_off = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + 
DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (tmp_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[1], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, + &frp_entry_data[1], + entry_offset + 1); + if (!prev_off) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (tmp_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + } + } + + bitmap_clear((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); + kfree(frp_entry_data); + return 0; +} + +static void dn200_clear_lram_pf_uc_rxp(struct mac_device_info *hw) +{ + unsigned long bitmap_uc; + u64 j; + struct mac_addr_route *addr_route = + kmalloc(sizeof(struct mac_addr_route), GFP_ATOMIC); + + if (!addr_route) { + netdev_err(hw->priv->dev, "%s Alloc addr_route memory failed\n", + __func__); + return; + } + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_uc, &bitmap_uc); + for (j = 0; j < DN200_MAX_UC_MAC_ADDR_NUM; j++) { + if (!(bitmap_uc & (1ULL << j))) + continue; + + dn200_get_func_uc_mac_addr(hw, j, addr_route); + dn200_del_pf_uc_rxp_da_route(hw, addr_route->rxp_offset); + memset(addr_route, 0, sizeof(struct mac_addr_route)); + dn200_set_func_uc_mac_addr(hw, j, addr_route); + bitmap_clear((unsigned long *)&bitmap_uc, j, 1); + } + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_uc, bitmap_uc); + kfree(addr_route); +} + +/*multi addr*/ +/*clear/add mc rxp channel*/ +static void dn200_mc_rxp_channel_route_set(struct mac_device_info *hw, + bool enable, u16 bitmap_promisc) +{ + int entry_offset; + u32 data; + u8 next_off; + + if (!bitmap_promisc) + return; + + entry_offset = rxp_offset_get_from_bitmap(DN200_MC_ADDR_START); + 
dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (entry_offset)); + next_off = ((data & OK_INDEX_MASK) >> OK_INDEX_OFFSET); + if (!next_off) + return; + + /*get channel */ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET(entry_offset)); + if (enable) + data |= bitmap_promisc; + else + data &= ~bitmap_promisc; + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET + (entry_offset)); + while (next_off < DN200_ALL_MULTCAST_OFF) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET + (next_off)); + if (enable) + data |= bitmap_promisc; + else + data &= ~bitmap_promisc; + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET + (next_off)); + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET + (next_off)); + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (next_off)); + next_off = ((data & OK_INDEX_MASK) >> OK_INDEX_OFFSET); + if (!next_off) + return; + } +} + +static void dn200_del_mc_rxp_da_route(struct mac_device_info *hw, int offset) +{ + unsigned long bitmap; + u8 prev_off; + u32 prev_off_val; + u8 next_off; + u32 data; + u8 entry_offset; + struct RXP_FPR_ENTRY *frp_entry_data; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return; + } + + frp_entry_data = kcalloc(2, sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + + entry_offset = rxp_offset_get_from_bitmap(offset); + + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (entry_offset)); + next_off = ((data & OK_INDEX_MASK) >> OK_INDEX_OFFSET); + if (!next_off) + goto free_mem; + + if (prev_off && (prev_off < (DN200_VF_UC_OFF))) { + 
dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= (next_off << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + } + + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], + entry_offset + 1); + + bitmap_clear((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); +free_mem: + kfree(frp_entry_data); +} + +static void dn200_update_bcmc_channel(struct mac_device_info *hw, + int rxp_offset, bool add, u8 rxq_start) +{ + u32 data; + + if (rxp_offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return; + } + /* Disable RX Parser */ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, (u8)rxp_offset); + if (add) + data |= (1 << rxq_start); + else + data &= ~(1 << rxq_start); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, (u8)rxp_offset); +} + +static void dn200_clear_mc_da_route(struct mac_device_info *hw, u8 rxq_start) +{ + unsigned long bitmap_mac; + u64 j; + struct mac_addr_route *addr_route = + kmalloc(sizeof(struct mac_addr_route), GFP_ATOMIC); + + if (!addr_route) { + netdev_err(hw->priv->dev, "%s Alloc addr_route memory failed\n", + __func__); + return; + } + + 
DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_mac, &bitmap_mac); + for (j = 0; j < DN200_MAX_MC_ADDR_NUM; j++) { + if (!(bitmap_mac & (1ULL << j))) + continue; + + dn200_get_func_mac_addr(hw, j, addr_route); + if (addr_route->rxp_offset <= 0) { + memset(addr_route, 0, + sizeof(struct mac_addr_route)); + bitmap_clear((unsigned long *)&bitmap_mac, j, + 1); + dn200_set_func_mac_addr(hw, j, addr_route); + goto free_route; + } + if (addr_route->channel & (1 << rxq_start)) { /*some func owns this mc */ + addr_route->channel &= ~(1 << rxq_start); + if (addr_route->channel == 0) { /*only one func owned this mac */ + dn200_del_mc_rxp_da_route(hw, + addr_route->rxp_offset); + memset(addr_route, 0, + sizeof(struct mac_addr_route)); + bitmap_clear((unsigned long *)&bitmap_mac, j, + 1); + } else { /*update channel */ + dn200_update_bcmc_channel(hw, + addr_route->rxp_offset, false, rxq_start); + } + dn200_set_func_mac_addr(hw, j, addr_route); + } + } + +free_route: + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_mac, bitmap_mac); + kfree(addr_route); +} + +static void dwxgmac2_append_rxp_da_route_sriov(struct mac_device_info *hw, + int offset, u16 channel) +{ + u32 data; + u32 entry_offset = 0; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return; + } + entry_offset = rxp_offset_get_from_bitmap(offset); + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET(entry_offset)); + + data |= (1 << channel); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET + (entry_offset)); +} + +static int dwxgmac2_add_mc_rxp_da_route_sriov(struct mac_device_info *hw, + u8 *mac_addr, int offset, u8 rxq_start) +{ + unsigned long bitmap; + int prev_off; + int prev_off_val; + int next_off; + u32 data; + u32 channel = 0; + u8 entry_offset = 0; + struct RXP_FPR_ENTRY *frp_entry_data; + + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: parameter offset less than zero\n", __func__); + return -EINVAL; + } + + if 
(!netif_running(hw->priv->dev)) + return -EBUSY; + + frp_entry_data = kmalloc(2 * sizeof(struct RXP_FPR_ENTRY), GFP_ATOMIC); + if (!frp_entry_data) + return -ENOMEM; + + channel = (1 << rxq_start); + memset(&frp_entry_data[0], 0, sizeof(struct RXP_FPR_ENTRY)); + memset(&frp_entry_data[1], 0, sizeof(struct RXP_FPR_ENTRY)); + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + if (!offset) { + offset = + dn200_get_unused_bit(&bitmap, DN200_MC_ADDR_END, + DN200_MC_ADDR_START); + if (offset < 0) { + netdev_err(hw->priv->dev, "%s: mc unused bit smaller zero\n", __func__); + kfree(frp_entry_data); + return -EBUSY; + } + } + /*Skip this option if it has already been configured */ + if (bitmap & (1ULL << offset)) { + netdev_dbg(hw->priv->dev, "%s: mc has already been configured\n", __func__); + kfree(frp_entry_data); + return offset; + } + entry_offset = rxp_offset_get_from_bitmap(offset); + + prev_off = dn200_get_prev_used_bit(&bitmap, offset); + + prev_off_val = rxp_offset_get_from_bitmap(prev_off); + + if (prev_off && prev_off < DN200_VF_UC_OFF) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + + next_off = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val + 2)); + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + + next_off = (data & OK_INDEX_MASK) >> OK_INDEX_OFFSET; + + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (prev_off_val)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + 
DA_OK_INDEX_FIRST_ENTRY_OFFSET + (prev_off_val)); + } + + frp_entry_data[0].match_data = cpu_to_le32(*(u32 *)(mac_addr)); + frp_entry_data[0].match_en = 0xffffffff; + frp_entry_data[0].nc = 1; + frp_entry_data[0].ok_index = next_off; + frp_entry_data[0].frame_offset = 0; + + frp_entry_data[1].match_data = cpu_to_le32(*(u16 *)(mac_addr + 4)); + frp_entry_data[1].match_en = 0xffff; + frp_entry_data[1].nc = 1; + frp_entry_data[1].frame_offset = 1; + frp_entry_data[1].dma_ch_no = channel; + frp_entry_data[1].ok_index = next_off; + frp_entry_data[1].af = 1; + + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], + entry_offset); + dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], + entry_offset + 1); + + bitmap_set((unsigned long *)&bitmap, offset, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); + kfree(frp_entry_data); + return offset; +} + +static int dn200_mc_add_rxp(struct mac_device_info *hw, u8 *mac_addr, u8 rxq_start) +{ + unsigned long bitmap_mac; + unsigned long bitmap_allmucast; + int unused_bit = -1; + int j; + int found = 0; + struct mac_addr_route *addr_route = + kmalloc(sizeof(struct mac_addr_route), GFP_ATOMIC); + + if (!addr_route) { + netdev_err(hw->priv->dev, "%s Alloc addr_route memory failed\n", + __func__); + return -ENOMEM; + } + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_mac, &bitmap_mac); + for (j = 0; j < DN200_MAX_MC_ADDR_NUM; j++) { + if (!(bitmap_mac & (1ULL << j))) + continue; + dn200_get_func_mac_addr(hw, j, addr_route); + if (memcmp(addr_route->mac_addr, mac_addr, ETH_ALEN) == 0) { + found = 1; + break; + } + } + + if (found) { + if (addr_route->rxp_offset <= 0) + goto free_route; + + if (addr_route->rxp_offset != 0) { + if ((1 << rxq_start) ^ addr_route->channel) { + addr_route->channel |= + (1 << rxq_start); + dwxgmac2_append_rxp_da_route_sriov(hw, + addr_route->rxp_offset, + rxq_start); + } + } + dn200_set_func_mac_addr(hw, j, addr_route); + } else { + unused_bit = + 
dn200_get_unused_bit(&bitmap_mac, + (DN200_MAX_MC_ADDR_NUM - 1), 0); + if (unused_bit < 0) { + /*set allmultist */ + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, + &bitmap_allmucast); + bitmap_allmucast |= rxq_start; + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, + bitmap_allmucast); + + kfree(addr_route); + return -ENOMEM; + } + addr_route->rxp_offset = + dwxgmac2_add_mc_rxp_da_route_sriov(hw, mac_addr, 0, rxq_start); + if (addr_route->rxp_offset < 0) + goto free_route; + addr_route->channel = (1 << rxq_start); + memcpy(addr_route->mac_addr, mac_addr, ETH_ALEN); + dn200_set_func_mac_addr(hw, unused_bit, addr_route); + bitmap_set((unsigned long *)&bitmap_mac, unused_bit, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_mac, bitmap_mac); + } +free_route: + kfree(addr_route); + return 0; +} + +static int dn200_get_used_bit_from_last(unsigned long *bitmap, u8 last, + u8 first) +{ + int i = last - 1; + + for (; i >= first; i--) { + if ((bitmap[0] & (1ULL << i))) + return i; + } + return -1; +} + +/* update the last + * update promsic + * update allmultist + */ +static void dn200_update_ampm_rxp(struct mac_device_info *hw, u8 rxq_start) +{ + unsigned long bitmap; + unsigned long bitmap_allmucast; + unsigned long bitmap_promisc; + u32 offset = 0; + u32 offset_val = 0; + u32 data; + u32 entry_offset = 0; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + offset = + dn200_get_used_bit_from_last(&bitmap, (DN200_MC_ADDR_END + 1), 0); + if (offset < 0) + return; + + offset_val = rxp_offset_get_from_bitmap(offset); + + /*according to the bitmap,we can judge the direction */ + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, &bitmap_allmucast); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, &bitmap_promisc); + if (bitmap_allmucast) + entry_offset = DN200_ALL_MULTCAST_OFF; + else if (bitmap_promisc) + entry_offset = DN200_ALL_PROMISC_OFF; + else + entry_offset = DN200_ALL_DROP_OFF; + if (offset && (offset < (1 + 1 + 15))) { + 
dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (offset_val + 2)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (offset_val + 2)); + } else { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (offset_val)); + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (offset_val)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (offset_val)); + + data &= ~(OK_INDEX_MASK); + data |= ((entry_offset) << OK_INDEX_OFFSET); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_SEC_ENTRY_OFFSET + (offset_val)); + } + + /*judge only PF open and update ampm */ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (DN200_ALL_MULTCAST_OFF)); + if (bitmap_allmucast) { + if (bitmap_promisc) { + data &= ~(OK_INDEX_MASK); + data |= ((DN200_ALL_PROMISC_OFF) << OK_INDEX_OFFSET); + } else { + data &= ~(OK_INDEX_MASK); + data |= ((DN200_ALL_DROP_OFF) << OK_INDEX_OFFSET); + } + } + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_OK_INDEX_FIRST_ENTRY_OFFSET + (DN200_ALL_MULTCAST_OFF)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET_AMPM + (DN200_ALL_MULTCAST_OFF)); + if (bitmap_allmucast & (1 << rxq_start)) + data |= (1 << rxq_start); + else + data &= ~(1 << rxq_start); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET_AMPM + (DN200_ALL_MULTCAST_OFF)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DA_DMA_CHA_NO_OFFSET_AMPM + (DN200_ALL_PROMISC_OFF)); + if (bitmap_promisc & (1 << rxq_start)) + data |= (1 << rxq_start); + else + data &= ~(1 << rxq_start); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET_AMPM + (DN200_ALL_PROMISC_OFF)); + + 
dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + AFRFNC_ENTRY_OFFSET + (DN200_ALL_MULTCAST_OFF)); + data &= (~RF_ENABLE); + data |= AF_ENABLE; + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + AFRFNC_ENTRY_OFFSET + (DN200_ALL_MULTCAST_OFF)); + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + AFRFNC_ENTRY_OFFSET + (DN200_ALL_PROMISC_OFF)); + if (bitmap_promisc & BIT(0)) { /*PF open */ + if (bitmap_promisc & GENMASK(15, 8)) { + data &= (~RF_ENABLE); + data |= AF_ENABLE; + } else { /* only pf open */ + data &= (~RF_ENABLE); + data &= (~AF_ENABLE); + } + } else if (bitmap_promisc) { + data &= (~RF_ENABLE); + data |= AF_ENABLE; + } + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + AFRFNC_ENTRY_OFFSET + (DN200_ALL_PROMISC_OFF)); +} + +static void dn200_clear_allmu_promisc_da_route(struct mac_device_info *hw, + int offset, u8 rxq_start) +{ + unsigned long bitmap_am; + unsigned long bitmap_pm; + u32 data; + + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, &bitmap_am); + DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, &bitmap_pm); + /*clear allmuticast */ + if (bitmap_am && (bitmap_am & (1 << rxq_start))) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DN200_ALL_MULTCAST_OFF); + data &= ~(1 << offset); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DN200_ALL_MULTCAST_OFF); + bitmap_clear((unsigned long *)&bitmap_am, + rxq_start, 1); + } + + if (bitmap_pm && (bitmap_pm & (1 << rxq_start))) { + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, + DN200_ALL_PROMISC_OFF); + data &= ~(1 << offset); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DN200_ALL_PROMISC_OFF); + bitmap_clear((unsigned long *)&bitmap_pm, + rxq_start, 1); + } +} + +static int dwxgmac_config_rxp_broadcast_sriov(struct mac_device_info *hw) +{ + struct RXP_FPR_ENTRY frp_entry_data[5]; + u32 channel = 0; + u32 val; + unsigned long bitmap = 0; + unsigned long bitmap_ampm = 0; + int ret = 0; + + channel = (1 << 0); + memset(&frp_entry_data[0], 0, sizeof(struct RXP_FPR_ENTRY)); + 
memset(&frp_entry_data[1], 0, sizeof(struct RXP_FPR_ENTRY)); + memset(&frp_entry_data[2], 0, sizeof(struct RXP_FPR_ENTRY)); + memset(&frp_entry_data[3], 0, sizeof(struct RXP_FPR_ENTRY)); + memset(&frp_entry_data[4], 0, sizeof(struct RXP_FPR_ENTRY)); + + frp_entry_data[0].match_data = 0xffffffff; + frp_entry_data[0].match_en = 0xffffffff; + frp_entry_data[0].nc = 1; + frp_entry_data[0].ok_index = DN200_ALL_DROP_OFF; + frp_entry_data[0].frame_offset = 0; + frp_entry_data[0].dma_ch_no = channel; + + frp_entry_data[1].match_data = 0xffff; + frp_entry_data[1].match_en = 0xffff; + frp_entry_data[1].nc = 0; + frp_entry_data[1].frame_offset = 1; + frp_entry_data[1].dma_ch_no = channel; + frp_entry_data[1].ok_index = DN200_ALL_DROP_OFF; + frp_entry_data[1].af = 1; + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[0], 0); + if (ret) + goto rxp_updt_err; + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[1], 1); + if (ret) + goto rxp_updt_err; + + frp_entry_data[2].af = 1; + frp_entry_data[2].rf = 1; + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[2], + DN200_BC_RXP_OFF); + if (ret) + goto rxp_updt_err; + + frp_entry_data[2].af = 0; + frp_entry_data[2].rf = 1; + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[2], + DN200_ALL_DROP_OFF); + if (ret) + goto rxp_updt_err; + + /*allmulcast */ + frp_entry_data[3].match_data = 0x1; + frp_entry_data[3].match_en = 0x1; + frp_entry_data[3].nc = 1; + frp_entry_data[3].ok_index = DN200_ALL_DROP_OFF; + frp_entry_data[3].frame_offset = 0; + frp_entry_data[3].dma_ch_no = 0; + frp_entry_data[3].af = 1; + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[3], + DN200_ALL_MULTCAST_OFF); + if (ret) + goto rxp_updt_err; + + /*promsic */ + frp_entry_data[4].nc = 0; + frp_entry_data[4].ok_index = DN200_BC_RXP_OFF; + frp_entry_data[4].frame_offset = 0; + frp_entry_data[4].dma_ch_no = 0; + frp_entry_data[4].af = 1; + frp_entry_data[4].rf = 0; + ret = 
dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data[4], + DN200_ALL_PROMISC_OFF); + if (ret) + goto rxp_updt_err; + + val = (DN200_MAX_USED_RXP_NUM << 16) & XGMAC_NPE; + val |= DN200_MAX_USED_RXP_NUM & XGMAC_NVE; + writel(val, hw->pcsr + XGMAC_MTL_RXP_CONTROL_STATUS); + // DN200_GET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, &bitmap); + bitmap_set((unsigned long *)&bitmap, 0, 1); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_rxp, bitmap); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_uc, 0); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_allmucast, bitmap_ampm); + DN200_SET_LRAM_MAILBOX_MEMBER(hw, bitmap_promisc, bitmap_ampm); + dwxgmac3_rxp_enable(hw->pcsr); +rxp_updt_err: + return ret; +} + +/*append broadcast dma_channel*/ +static void dwxgmac2_vf_append_rxp_bc_sriov(struct mac_device_info *hw, + u16 channel) +{ + u32 data; + + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, DA_DMA_CHA_NO_OFFSET(0)); + + data |= (1 << channel); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET(0)); +} + +static int dwxgmac_reset_rxp(struct mac_device_info *hw) +{ + int i = 0; + int ret = 0; + struct RXP_FPR_ENTRY frp_entry_data = { 0 }; + + for (; i < 128; i++) { + ret = dwxgmac2_rxp_update_single_da_entry_sriov(hw, &frp_entry_data, i); + if (ret) + break; + } + return ret; +} + +static void dwxgmac_get_rxp_filter_sriov(struct mac_device_info *hw, + struct seq_file *seq) +{ + int i = 0; + struct RXP_FPR_ENTRY frp_entry_data = { 0 }; + u32 *data; + + for (; i < DN200_MAX_USED_RXP_NUM; i++) { + data = (u32 *)&frp_entry_data; + dwxgmac2_rxp_get_single_entry_sriov(hw, data, i * 4); + data = ((u32 *)&frp_entry_data) + 1; + dwxgmac2_rxp_get_single_entry_sriov(hw, data, i * 4 + 1); + data = ((u32 *)&frp_entry_data) + 2; + dwxgmac2_rxp_get_single_entry_sriov(hw, data, i * 4 + 2); + data = ((u32 *)&frp_entry_data) + 3; + dwxgmac2_rxp_get_single_entry_sriov(hw, data, i * 4 + 3); + seq_puts(seq, "==============\n"); + seq_printf(seq, "RXP ENTRY %d\n", i); + seq_printf(seq, 
"%16s: %08x\n", "match_data", + frp_entry_data.match_data); + seq_printf(seq, "%16s: %08x\n", "match_en", + frp_entry_data.match_en); + seq_printf(seq, "%16s: %8x\n", "af", frp_entry_data.af); + seq_printf(seq, "%16s: %8x\n", "rf", frp_entry_data.rf); + seq_printf(seq, "%16s: %8x\n", "im", frp_entry_data.im); + seq_printf(seq, "%16s: %8x\n", "frame_offset", + frp_entry_data.frame_offset); + seq_printf(seq, "%16s: %8x\n", "nc", frp_entry_data.nc); + seq_printf(seq, "%16s: %8d\n", "ok_index", + frp_entry_data.ok_index); + seq_printf(seq, "%16s: %8x\n", "dma_ch_no", + frp_entry_data.dma_ch_no); + } +} + +static struct dn200_tc_entry *dwxgmac3_rxp_get_next_entry(struct dn200_tc_entry + *entries, + unsigned int count, + u32 curr_prio) +{ + struct dn200_tc_entry *entry; + u32 min_prio = ~0x0; + int i, min_prio_idx; + bool found = false; + + for (i = count - 1; i >= 0; i--) { + entry = &entries[i]; + + /* Do not update unused entries */ + if (!entry->in_use) + continue; + /* Do not update already updated entries (i.e. 
fragments) */ + if (entry->in_hw) + continue; + /* Let last entry be updated last */ + if (entry->is_last) + continue; + /* Do not return fragments */ + if (entry->is_frag) + continue; + /* Check if we already checked this prio */ + if (entry->prio < curr_prio) + continue; + /* Check if this is the minimum prio */ + if (entry->prio < min_prio) { + min_prio = entry->prio; + min_prio_idx = i; + found = true; + } + } + + if (found) + return &entries[min_prio_idx]; + return NULL; +} + +static int dwxgmac3_rxp_config_sriov(struct mac_device_info *hw, + struct dn200_tc_entry *entries, + unsigned int count) +{ + int ret, nve = DN200_BC_RXP_OFF; + u32 old_val, val; + void __iomem *ioaddr = hw->pcsr; + /* Force disable RX */ + old_val = readl(ioaddr + XGMAC_RX_CONFIG); + val = old_val & ~XGMAC_CONFIG_RE; + writel(val, ioaddr + XGMAC_RX_CONFIG); + + /* Disable RX Parser */ + ret = dwxgmac3_rxp_disable(ioaddr); + if (ret) + goto re_enable; + + if (!nve) + goto re_enable; + + /* Assume n. of parsable entries == n. 
of valid entries */ + val = (nve << 16) & XGMAC_NPE; + val |= nve & XGMAC_NVE; + writel(val, ioaddr + XGMAC_MTL_RXP_CONTROL_STATUS); + + /* Enable RX Parser */ + dwxgmac3_rxp_enable(ioaddr); + +re_enable: + /* Re-enable RX */ + writel(old_val, ioaddr + XGMAC_RX_CONFIG); + return ret; +} + +static int dwxgmac3_rxp_config_purepf(struct mac_device_info *hw, + struct dn200_tc_entry *entries, + unsigned int count) +{ + struct dn200_tc_entry *entry, *frag; + int i, ret, nve = 0; + u32 curr_prio = 0; + u32 old_val, val; + void __iomem *ioaddr = hw->pcsr; + /* Force disable RX */ + old_val = readl(ioaddr + XGMAC_RX_CONFIG); + val = old_val & ~XGMAC_CONFIG_RE; + writel(val, ioaddr + XGMAC_RX_CONFIG); + + /* Disable RX Parser */ + ret = dwxgmac3_rxp_disable(ioaddr); + if (ret) + goto re_enable; + + /* Set all entries as NOT in HW */ + for (i = 0; i < count; i++) { + entry = &entries[i]; + entry->in_hw = false; + } + + /* Update entries by reverse order */ + while (1) { + entry = dwxgmac3_rxp_get_next_entry(entries, count, curr_prio); + if (!entry) + break; + + curr_prio = entry->prio; + frag = entry->frag_ptr; + + /* Set special fragment requirements */ + if (frag) { + entry->val.af = 0; + entry->val.rf = 0; + entry->val.nc = 1; + entry->val.ok_index = nve + 2; + } + + ret = dwxgmac3_rxp_update_single_entry(hw, entry, nve); + if (ret) + goto re_enable; + + entry->table_pos = nve++; + entry->in_hw = true; + + if (frag && !frag->in_hw) { + ret = dwxgmac3_rxp_update_single_entry(hw, frag, nve); + if (ret) + goto re_enable; + frag->table_pos = nve++; + frag->in_hw = true; + } + } + + if (!nve) + goto re_enable; + + /* Update all pass entry */ + for (i = 0; i < count; i++) { + entry = &entries[i]; + if (!entry->is_last) + continue; + + ret = dwxgmac3_rxp_update_single_entry(hw, entry, nve); + if (ret) + goto re_enable; + + entry->table_pos = nve++; + } + + /* Assume n. of parsable entries == n. 
of valid entries */ + val = (nve << 16) & XGMAC_NPE; + val |= nve & XGMAC_NVE; + writel(val, ioaddr + XGMAC_MTL_RXP_CONTROL_STATUS); + + /* Enable RX Parser */ + dwxgmac3_rxp_enable(ioaddr); + +re_enable: + /* Re-enable RX */ + writel(old_val, ioaddr + XGMAC_RX_CONFIG); + return ret; +} + +static void dwxgmac2_clear_vf_rxp_route(struct mac_device_info *hw, u8 vf_off) +{ + u32 data; + u8 vf_rx_start, off; + + if (HW_IS_VF(hw) || !PRIV_SRIOV_SUPPORT(hw->priv)) + return; + + vf_rx_start = hw->priv->plat_ex->pf.vfs[vf_off].rx_queue_start; + /*unicast */ + dn200_mc_rxp_channel_route_set(hw, false, + (1 << vf_rx_start)); + + dn200_clear_mc_da_route(hw, vf_rx_start); /*pf and vf */ + + /*broadcast */ + dwxgmac2_rxp_get_single_entry_sriov(hw, &data, DA_DMA_CHA_NO_OFFSET(0)); + data &= ~(1 << (vf_rx_start)); + dwxgmac2_rxp_update_single_entry_sriov(hw, data, + DA_DMA_CHA_NO_OFFSET(0)); + + dn200_clear_allmu_promisc_da_route(hw, vf_rx_start, vf_rx_start); /*pf and vf */ + + /*delete rxp unicast mac addr */ + if (HW_IS_VF(hw)) + off = (vf_rx_start - 8) + DN200_VF_UC_OFF; + else + off = 1; + dn200_del_vf_uc_rxp_da_route(hw, off); +} + +static void dwxgmac2_wq_vf_del_rxp_route(struct mac_device_info *hw, int offset, u8 rxq_start) +{ + u8 off; + + /*broadcast */ + dn200_update_bcmc_channel(hw, DA_DMA_CHA_NO_OFFSET(0), false, rxq_start); + + /*delete rxp unicast mac addr */ + off = (rxq_start - 8) + DN200_VF_UC_OFF; + dn200_del_vf_uc_rxp_da_route(hw, off); + + dn200_mc_rxp_channel_route_set(hw, false, + (1 << rxq_start)); + + /*delete rxp multi mac addr */ + dn200_clear_mc_da_route(hw, rxq_start); /*pf and vf */ + /*pm am */ + dn200_clear_allmu_promisc_da_route(hw, rxq_start, rxq_start); +} + +static void dwxgmac2_vf_del_rxp_route(struct mac_device_info *hw) +{ + u8 wakeup_wq; + + dwxgmac2_vf_set_async_info(hw, hw->priv->dev, &wakeup_wq, DN200_VF_CLEAR_RXP); +} + +const struct dn200_ops dwxgmac_purepf_ops = { + .core_init = dwxgmac2_core_init, + .set_mac = dwxgmac2_set_mac, + 
.set_mac_rx = dwxgmac2_mac_rx_set, + .get_mac_rx = dwxgmac2_mac_rx_get, + .rx_ipc = dwxgmac2_rx_ipc, + .rx_queue_enable = dwxgmac2_rx_queue_enable, + .rx_queue_disable = dwxgmac2_rx_queue_disable, + .rx_dds_config = NULL, + .rx_queue_prio = dwxgmac2_rx_queue_prio, + .tx_queue_prio = dwxgmac2_tx_queue_prio, + .rx_queue_routing = NULL, + .prog_mtl_rx_algorithms = dwxgmac2_prog_mtl_rx_algorithms, + .prog_mtl_tx_algorithms = dwxgmac2_prog_mtl_tx_algorithms, + .set_mtl_tx_queue_weight = dwxgmac2_set_mtl_tx_queue_weight, + .set_mtl_rx_queue_weight = dwxgmac2_set_mtl_rx_queue_weight, + .map_mtl_to_dma = dwxgmac2_map_mtl_to_dma, + .mtl_dynamic_chan_set = dwxgmac2_mtl_dynamic_chan_set, + .config_cbs = dwxgmac2_config_cbs, + .dump_regs = dwxgmac2_dump_regs, + .host_irq_status = dwxgmac2_host_irq_status, + .host_mtl_irq_status = dwxgmac2_host_mtl_irq_status, + .flow_ctrl = dwxgmac2_flow_ctrl, + .set_umac_addr = dwxgmac2_set_umac_addr, + .get_umac_addr = dwxgmac2_get_umac_addr, + .set_eee_mode = dwxgmac2_set_eee_mode, + .reset_eee_mode = dwxgmac2_reset_eee_mode, + .set_eee_timer = dwxgmac2_set_eee_timer, + .set_eee_pls = dwxgmac2_set_eee_pls, + .pcs_ctrl_ane = NULL, + .pcs_rane = NULL, + .pcs_get_adv_lp = NULL, + .debug = NULL, + .set_filter = dwxgmac2_set_filter_purepf, + .safety_feat_config = dwxgmac3_safety_feat_config, + .safety_feat_irq_status = dwxgmac3_safety_feat_irq_status, + .safety_feat_dump = dwxgmac3_safety_feat_dump, + .set_mac_loopback = dwxgmac2_set_mac_loopback, + .rss_configure = dwxgmac2_rss_configure, + .update_vlan_hash = dwxgmac2_update_vlan_hash, + .rxp_config = dwxgmac3_rxp_config_purepf, + .get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp, + .flex_pps_config = dwxgmac2_flex_pps_config, + .sarc_configure = dwxgmac2_sarc_configure, + .enable_vlan = dwxgmac2_enable_vlan, + .config_l3_filter = dwxgmac2_config_l3_filter, + .config_l4_filter = dwxgmac2_config_l4_filter, + .config_ntuple_filter = dwxgmac2_config_ntuple_filter, + .l3_l4_filter_config = 
dwxgmac2_config_l3l4_filter, + .set_arp_offload = dwxgmac2_set_arp_offload, + .est_configure = dwxgmac3_est_configure, + .fpe_configure = dwxgmac3_fpe_configure, + .rxp_broadcast = NULL, + .rxp_filter_get = NULL, + .add_hw_vlan_rx_fltr = dwxgmac2_add_hw_vlan_rx_fltr, + .del_hw_vlan_rx_fltr = dwxgmac2_del_hw_vlan_rx_fltr, + .init_hw_vlan_rx_fltr = dwxgmac2_hw_vlan_init, + .config_vlan_rx_fltr = dwxgmac2_config_vlan_rx_fltr, + .rx_vlan_stripping_config = dwxgmac2_rx_vlan_stripping_config, + .tx_queue_flush = dwxgmac2_mtl_flush, + .reset_rxp = dwxgmac_reset_rxp, + .rxf_and_acl_mem_reset = dwxgmac2_rxf_and_acl_mem_reset, +}; + +const struct dn200_ops dwxgmac_sriov_ops = { + .core_init = dwxgmac2_core_init, + .set_mac = dwxgmac2_set_mac, + .set_mac_rx = dwxgmac2_mac_rx_set, + .get_mac_rx = dwxgmac2_mac_rx_get, + .rx_ipc = dwxgmac2_rx_ipc, + .rx_queue_enable = dwxgmac2_rx_queue_enable, + .rx_queue_disable = dwxgmac2_rx_queue_disable, + .rx_dds_config = dwxgmac2_rx_dds_config_sriov, + .rx_queue_prio = dwxgmac2_rx_queue_prio, + .tx_queue_prio = dwxgmac2_tx_queue_prio, + .rx_queue_routing = NULL, + .prog_mtl_rx_algorithms = dwxgmac2_prog_mtl_rx_algorithms, + .prog_mtl_tx_algorithms = dwxgmac2_prog_mtl_tx_algorithms, + .set_mtl_tx_queue_weight = dwxgmac2_set_mtl_tx_queue_weight, + .set_mtl_rx_queue_weight = dwxgmac2_set_mtl_rx_queue_weight, + .map_mtl_to_dma = dwxgmac2_map_mtl_to_dma, + .mtl_dynamic_chan_set = dwxgmac2_mtl_dynamic_chan_set, + .config_cbs = dwxgmac2_config_cbs, + .dump_regs = dwxgmac2_dump_regs, + .host_irq_status = dwxgmac2_host_irq_status, + .host_mtl_irq_status = dwxgmac2_host_mtl_irq_status, + .flow_ctrl = dwxgmac2_flow_ctrl, + .set_umac_addr = dwxgmac2_set_umac_addr, + .wq_set_umac_addr = dwxgmac2_wq_set_umac_addr, + .get_umac_addr = NULL, + .set_eee_mode = dwxgmac2_set_eee_mode, + .reset_eee_mode = dwxgmac2_reset_eee_mode, + .set_eee_timer = dwxgmac2_set_eee_timer, + .set_eee_pls = dwxgmac2_set_eee_pls, + .pcs_ctrl_ane = NULL, + .pcs_rane = NULL, + 
.pcs_get_adv_lp = NULL, + .debug = NULL, + .set_filter = dwxgmac2_set_filter_sriov, + .wq_set_filter = dwxgmac2_wq_set_filter, + .safety_feat_config = dwxgmac3_safety_feat_config, + .safety_feat_irq_status = dwxgmac3_safety_feat_irq_status, + .safety_feat_dump = dwxgmac3_safety_feat_dump, + .set_mac_loopback = dwxgmac2_set_mac_loopback, + .rss_configure = dwxgmac2_rss_configure, + .update_vlan_hash = dwxgmac2_update_vlan_hash, + .rxp_config = dwxgmac3_rxp_config_sriov, + .get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp, + .flex_pps_config = dwxgmac2_flex_pps_config, + .sarc_configure = dwxgmac2_sarc_configure, + .enable_vlan = dwxgmac2_enable_vlan, + .config_l3_filter = dwxgmac2_config_l3_filter, + .config_l4_filter = dwxgmac2_config_l4_filter, + .config_ntuple_filter = dwxgmac2_config_ntuple_filter, + .l3_l4_filter_config = dwxgmac2_config_l3l4_filter, + .set_arp_offload = dwxgmac2_set_arp_offload, + .est_configure = dwxgmac3_est_configure, + .fpe_configure = dwxgmac3_fpe_configure, + .rxp_broadcast = dwxgmac_config_rxp_broadcast_sriov, + .rxp_filter_get = dwxgmac_get_rxp_filter_sriov, + .rxp_clear = dwxgmac2_rxp_clear_entry_sriov, + .vf_del_rxp = dwxgmac2_vf_del_rxp_route, + .wq_vf_del_rxp = dwxgmac2_wq_vf_del_rxp_route, + .clear_vf_rxp = dwxgmac2_clear_vf_rxp_route, + .vf_append_rxp_bc = dwxgmac2_vf_append_rxp_bc_sriov, + .mtl_reset = dwxgmac2_mtl_reset, + .tx_queue_flush = dwxgmac2_mtl_flush, + .config_vlan_rx_fltr = dwxgmac2_rxp_vlan_filter_config, + .add_hw_vlan_rx_fltr = dwxgmac2_pf_add_rxp_vlan_route, + .init_hw_vlan_rx_fltr = dwxgmac2_sriov_init_rxp_vlan_route, + .del_hw_vlan_rx_fltr = dwxgmac2_pf_del_rxp_vlan_route, + .rx_vlan_stripping_config = dwxgmac2_rx_vlan_stripping_config, + .reset_rxp = dwxgmac_reset_rxp, + .rxf_and_acl_mem_reset = dwxgmac2_rxf_and_acl_mem_reset, +}; + +static u32 dwxgmac2_get_num_vlan(void __iomem *ioaddr) +{ + u32 val, num_vlan; + + val = readl(ioaddr + XGMAC_HW_FEATURE3); + switch (val & XGMAC_HWFEAT_NRVF) { + case 0: + 
num_vlan = 1; + break; + case 1: + num_vlan = 4; + break; + case 2: + num_vlan = 8; + break; + case 3: + num_vlan = 16; + break; + case 4: + num_vlan = 24; + break; + case 5: + num_vlan = 32; + break; + case 7: + num_vlan = 4096; + break; + default: + num_vlan = 1; + } + return num_vlan; +} + +int dwxgmac2_setup(struct dn200_priv *priv) +{ + struct mac_device_info *mac = priv->hw; + + mac->pcsr = priv->ioaddr; + mac->pmail = priv->plat_ex->pf.ioaddr; + mac->multicast_filter_bins = priv->plat->multicast_filter_bins; + mac->unicast_filter_entries = priv->plat->unicast_filter_entries; + mac->mcast_bits_log2 = 0; + + if (mac->multicast_filter_bins) + mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + + mac->link.duplex = 0; + mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; + mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; + mac->link.speed1000 = XGMAC_CONFIG_SS_1000_GMII; + mac->link.speed2500 = XGMAC_CONFIG_SS_2500_GMII; + mac->link.xgmii.speed2500 = XGMAC_CONFIG_SS_2500; + mac->link.xgmii.speed5000 = XGMAC_CONFIG_SS_5000; + mac->link.xgmii.speed10000 = XGMAC_CONFIG_SS_10000; + mac->link.speed_mask = XGMAC_CONFIG_SS_MASK; + + mac->mii.addr = XGMAC_MDIO_ADDR; + mac->mii.data = XGMAC_MDIO_DATA; + mac->mii.addr_shift = 16; + mac->mii.addr_mask = GENMASK(20, 16); + mac->mii.reg_shift = 0; + mac->mii.reg_mask = GENMASK(15, 0); + mac->mii.clk_csr_shift = 19; + mac->mii.clk_csr_mask = GENMASK(21, 19); + if (HW_IS_PUREPF(priv->hw)) + mac->max_vlan_num = dwxgmac2_get_num_vlan(priv->ioaddr); + + return 0; +} diff --git a/drivers/net/ethernet/dapustor/dn200/dwxgmac2_descs.c b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_descs.c new file mode 100644 index 000000000000..ce7998ce4840 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_descs.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include "dn200.h" +#include "common.h" +#include "dwxgmac_comm.h" + +static int dwxgmac2_get_tx_status(void *data, struct dn200_extra_stats *x, + struct dma_desc *p, void __iomem *ioaddr) +{ + unsigned int tdes3 = le32_to_cpu(p->des3); + int ret = tx_done; + + if (unlikely(tdes3 & XGMAC_TDES3_OWN)) + return tx_dma_own; + if (likely(!(tdes3 & XGMAC_TDES3_LD))) + return tx_not_ls; + + return ret; +} + +static int dwxgmac2_get_rx_err_status(struct dn200_extra_stats *x, + struct dma_desc *p, bool rec_all, + bool csum_err_flag) +{ + unsigned int rdes3 = le32_to_cpu(p->des3); + u8 err_result = (rdes3 & XGMAC_RDES3_COND) >> XGMAC_TDES3_CIC_SHIFT; + /*vxlan err */ + if (csum_err_flag) { + /*if outer err,we need commit but not add */ + if (err_result == outer_ip_header_err || + err_result == outer_l4_chksum_err) { + return good_frame; + } + if (err_result == inner_ip_header_err) { + x->rx_csum_err++; + return good_frame; + } + if (err_result == inner_l4_chksum_err) { + x->rx_csum_err++; + return good_frame; + } + + } else { /*ordinary packet err */ + if (err_result == ip_header_err) { + x->rx_csum_err++; + return good_frame; + } + if (err_result == l4_chksum_err) { + x->rx_csum_err++; + return good_frame; + } + } + if (!!rec_all) + return good_frame; + return discard_frame; +} + +static int dwxgmac2_get_rx_status(void *data, struct dn200_extra_stats *x, + struct dma_desc *p, bool rec_all) +{ + unsigned int rdes3 = le32_to_cpu(p->des3); + unsigned int rdes2 = le32_to_cpu(p->des2); + unsigned int csum_err_flag = 0; + + if (rdes2 & XGMAC_RDES2_TNP) /* tunnel packet */ + csum_err_flag = 1; + + if (unlikely(rdes3 & XGMAC_RDES3_OWN)) + return dma_own; + if (unlikely(rdes3 & XGMAC_RDES3_CTXT)) + return discard_frame; + if (likely(!(rdes3 & XGMAC_RDES3_LD))) + return rx_not_ls; + if (unlikely((rdes3 & XGMAC_RDES3_ES) && (rdes3 & XGMAC_RDES3_LD))) + return dwxgmac2_get_rx_err_status(x, p, rec_all, csum_err_flag); + + return good_frame; +} + +static int 
dwxgmac2_get_tx_len(struct dma_desc *p) +{ + return (le32_to_cpu(p->des2) & XGMAC_TDES2_B1L); +} + +static int dwxgmac2_get_tx_owner(struct dma_desc *p) +{ + return (le32_to_cpu(p->des3) & XGMAC_TDES3_OWN) > 0; +} + +static void dwxgmac2_set_tx_owner(struct dma_desc *p) +{ + p->des3 |= cpu_to_le32(XGMAC_TDES3_OWN); +} + +static void dwxgmac2_set_rx_owner(struct dma_desc *p, int disable_rx_ic) +{ + p->des3 |= cpu_to_le32(XGMAC_RDES3_OWN); + + // if (!disable_rx_ic) + // p->des3 |= cpu_to_le32(XGMAC_RDES3_IOC); +} + +static int dwxgmac2_get_tx_ls(struct dma_desc *p) +{ + return (le32_to_cpu(p->des3) & XGMAC_RDES3_LD) > 0; +} + +static int dwxgmac2_get_rx_frame_len(struct dma_desc *p, int rx_coe) +{ + return (le32_to_cpu(p->des3) & XGMAC_RDES3_PL); +} + +static void dwxgmac2_enable_tx_timestamp(struct dma_desc *p) +{ + p->des2 |= cpu_to_le32(XGMAC_TDES2_TTSE); +} + +static int dwxgmac2_get_tx_timestamp_status(struct dma_desc *p) +{ + return 0; /* Not supported */ +} + +static inline void dwxgmac2_get_timestamp(void *desc, u32 ats, u64 *ts) +{ + struct dma_desc *p = (struct dma_desc *)desc; + u64 ns = 0; + + ns += le32_to_cpu(p->des1) * 1000000000ULL; + ns += le32_to_cpu(p->des0); + + *ts = ns; +} + +static int dwxgmac2_rx_check_timestamp(void *desc) +{ + struct dma_desc *p = (struct dma_desc *)desc; + unsigned int rdes3 = le32_to_cpu(p->des3); + bool desc_valid, ts_valid; + + dma_rmb(); + + desc_valid = !(rdes3 & XGMAC_RDES3_OWN) && (rdes3 & XGMAC_RDES3_CTXT); + ts_valid = !(rdes3 & XGMAC_RDES3_TSD) && (rdes3 & XGMAC_RDES3_TSA); + + if (likely(desc_valid && ts_valid)) { + if (p->des0 == 0xffffffff && p->des1 == 0xffffffff) + return -EINVAL; + return 0; + } + + return -EINVAL; +} + +static int dwxgmac2_get_rx_timestamp_status(void *desc, void *next_desc, + u32 ats) +{ + struct dma_desc *p = (struct dma_desc *)desc; + unsigned int rdes3 = le32_to_cpu(p->des3); + int ret = -EBUSY; + + if (likely(rdes3 & XGMAC_RDES3_CDA)) + ret = dwxgmac2_rx_check_timestamp(next_desc); + 
+ return !ret; +} + +static void dwxgmac2_init_rx_desc(struct dma_desc *p, int disable_rx_ic, + int mode, int end, int bfsize) +{ + dwxgmac2_set_rx_owner(p, disable_rx_ic); +} + +static void dwxgmac2_init_tx_desc(struct dma_desc *p, int mode, int end) +{ + p->des0 = 0; + p->des1 = 0; + p->des2 = 0; + p->des3 = 0; +} + +static void dwxgmac2_prepare_tx_desc(struct dma_desc *p, int is_fs, int len, + bool csum_flag, int mode, bool tx_own, + bool ls, unsigned int tot_pkt_len) +{ + unsigned int tdes3 = le32_to_cpu(p->des3); + + p->des2 |= cpu_to_le32(len & XGMAC_TDES2_B1L); + + tdes3 |= tot_pkt_len & XGMAC_TDES3_FL; + if (is_fs) + tdes3 |= XGMAC_TDES3_FD; + else + tdes3 &= ~XGMAC_TDES3_FD; + + if (csum_flag) + tdes3 |= 0x3 << XGMAC_TDES3_CIC_SHIFT; + else + tdes3 &= ~XGMAC_TDES3_CIC; + + if (ls) + tdes3 |= XGMAC_TDES3_LD; + else + tdes3 &= ~XGMAC_TDES3_LD; + + /* Finally set the OWN bit. Later the DMA will start! */ + if (tx_own) + tdes3 |= XGMAC_TDES3_OWN; + + if (is_fs && tx_own) + /* When the own bit, for the first frame, has to be set, all + * descriptors for the same frame has to be set before, to + * avoid race condition. 
+ */ + dma_wmb(); + + p->des3 = cpu_to_le32(tdes3); +} + +static void dwxgmac2_prepare_tso_tx_desc(struct dma_desc *p, int tso_flag, + int len1, int len2, bool tx_own, + bool ls, unsigned int tcphdrlen, + unsigned int tcppayloadlen) +{ + unsigned int tdes3 = le32_to_cpu(p->des3); + + if (len1) + p->des2 |= cpu_to_le32(len1 & XGMAC_TDES2_B1L); + if (len2) + p->des2 |= cpu_to_le32((len2 << XGMAC_TDES2_B2L_SHIFT) & + XGMAC_TDES2_B2L); + if (tso_flag & TSO_DESC_IS_FIRST) { + tdes3 |= (XGMAC_TDES3_FD | XGMAC_TDES3_TSE); + + tdes3 |= (tcphdrlen << XGMAC_TDES3_THL_SHIFT) & XGMAC_TDES3_THL; + tdes3 |= tcppayloadlen & XGMAC_TDES3_TPL; + } else { + tdes3 &= ~XGMAC_TDES3_FD; + } + + if (tso_flag & TSO_DESC_IS_TUNNEL) + tdes3 |= (3 << XGMAC_TDES3_SAIC_SHIFT) & XGMAC_TDES3_SAIC; + + if (ls) + tdes3 |= XGMAC_TDES3_LD; + else + tdes3 &= ~XGMAC_TDES3_LD; + + /* Finally set the OWN bit. Later the DMA will start! */ + if (tx_own) + tdes3 |= XGMAC_TDES3_OWN; + + if ((tso_flag & TSO_DESC_IS_FIRST) && tx_own) + /* When the own bit, for the first frame, has to be set, all + * descriptors for the same frame has to be set before, to + * avoid race condition. 
+ */ + dma_wmb(); + + p->des3 = cpu_to_le32(tdes3); +} + +static void dwxgmac2_release_tx_desc(struct dma_desc *p, int mode) +{ + p->des0 = 0; + p->des1 = 0; + p->des2 = 0; + p->des3 = 0; +} + +static void dwxgmac2_set_tx_ic(struct dma_desc *p) +{ + p->des2 |= cpu_to_le32(XGMAC_TDES2_IOC); +} + +static void dwxgmac2_set_mss(struct dma_desc *p, unsigned int mss) +{ + p->des0 = 0; + p->des1 = 0; + p->des2 = cpu_to_le32(mss); + p->des3 = cpu_to_le32(XGMAC_TDES3_CTXT | XGMAC_TDES3_TCMSSV); +} + +static void dwxgmac2_get_addr(struct dma_desc *p, unsigned int *addr) +{ + *addr = le32_to_cpu(p->des0); +} + +static void dwxgmac2_set_addr(struct dma_desc *p, dma_addr_t addr, + struct mac_device_info *hw) +{ + p->des0 = cpu_to_le32(lower_32_bits(addr)); + p->des1 = cpu_to_le32(upper_32_bits(addr)); +} + +static void dwxgmac2_clear(struct dma_desc *p) +{ + p->des0 = 0; + p->des1 = 0; + p->des2 = 0; + p->des3 = 0; +} + +static int dwxgmac2_get_rx_hash(struct dma_desc *p, u32 *hash, + enum pkt_hash_types *type) +{ + unsigned int rdes3 = le32_to_cpu(p->des3); + u32 ptype; + + if (rdes3 & XGMAC_RDES3_RSV) { + ptype = (rdes3 & XGMAC_RDES3_L34T) >> XGMAC_RDES3_L34T_SHIFT; + + switch (ptype) { + case XGMAC_L34T_IP4TCP: + case XGMAC_L34T_IP4UDP: + case XGMAC_L34T_IP6TCP: + case XGMAC_L34T_IP6UDP: + *type = PKT_HASH_TYPE_L4; + break; + default: + *type = PKT_HASH_TYPE_L3; + break; + } + + *hash = le32_to_cpu(p->des1); + return 0; + } + + return -EINVAL; +} + +static int dwxgmac2_get_ovt(struct dma_desc *p) +{ + unsigned int rdes0 = le32_to_cpu(p->des0); + unsigned int rdes2 = le32_to_cpu(p->des2); + + if (rdes2 & XGMAC_RDES2_TNP) + return 0; + + return (rdes0 & XGMAC_RDES0_OVT); +} + +static void dwxgmac2_get_rx_header_len(struct dma_desc *p, unsigned int *len) +{ + if (le32_to_cpu(p->des3) & XGMAC_RDES3_L34T) + *len = le32_to_cpu(p->des2) & XGMAC_RDES2_HL; +} + +static void dwxgmac2_set_sec_addr(struct dma_desc *p, dma_addr_t addr, + bool is_valid, struct mac_device_info *hw) +{ + 
p->des2 = cpu_to_le32(lower_32_bits(addr)); + p->des3 = cpu_to_le32(upper_32_bits(addr)); +} + +static void dwxgmac2_set_sarc(struct dma_desc *p, u32 sarc_type) +{ + sarc_type <<= XGMAC_TDES3_SAIC_SHIFT; + p->des3 |= cpu_to_le32(sarc_type & XGMAC_TDES3_SAIC); +} + +static void dwxgmac2_set_vxlan(struct dma_desc *p) +{ + p->des3 |= GENMASK(24, 23); + p->des3 &= (~BIT(25)); +} + +static void dwxgmac2_set_vlan_tag(struct dma_desc *p, u16 tag, u16 inner_tag, + u32 inner_type) +{ + p->des0 = 0; + p->des1 = 0; + p->des2 = 0; + p->des3 = 0; + + /* Inner VLAN */ + if (inner_type) { + u32 des = inner_tag << XGMAC_TDES2_IVT_SHIFT; + + des &= XGMAC_TDES2_IVT; + p->des2 = cpu_to_le32(des); + + des = inner_type << XGMAC_TDES3_IVTIR_SHIFT; + des &= XGMAC_TDES3_IVTIR; + p->des3 = cpu_to_le32(des | XGMAC_TDES3_IVLTV); + } + + /* Outer VLAN */ + p->des3 |= cpu_to_le32(tag & XGMAC_TDES3_VT); + p->des3 |= cpu_to_le32(XGMAC_TDES3_VLTV); + + p->des3 |= cpu_to_le32(XGMAC_TDES3_CTXT); +} + +static void dwxgmac2_set_vlan(struct dma_desc *p, u32 type) +{ + type <<= XGMAC_TDES2_VTIR_SHIFT; + p->des2 |= cpu_to_le32(type & XGMAC_TDES2_VTIR); +} + +static void dwxgmac2_display_ring(void *head, unsigned int size, bool not_tbl, + dma_addr_t dma_rx_phy, unsigned int desc_size, + struct mac_device_info *hw) +{ + int i = 0; + dma_addr_t dma_addr; + struct dma_desc *p; + + if (desc_size != sizeof(struct dma_desc)) + return; + p = (struct dma_desc *)head; + + for (i = 0; i < size; i++) { + dma_addr = dma_rx_phy + i * desc_size; + if (desc_size == sizeof(struct dma_desc)) { + dev_info(hw->priv->device, + "%d [%pad]: 0x%x 0x%x 0x%x 0x%x\n", i, + &dma_addr, le32_to_cpu(p->des0), + le32_to_cpu(p->des1), le32_to_cpu(p->des2), + le32_to_cpu(p->des3)); + p++; + } + dev_info(hw->priv->device, "\n"); + } +} + +const struct dn200_desc_ops dwxgmac210_desc_ops = { + .tx_status = dwxgmac2_get_tx_status, + .rx_status = dwxgmac2_get_rx_status, + .get_tx_len = dwxgmac2_get_tx_len, + .get_tx_owner = 
		dwxgmac2_get_tx_owner,
	.set_tx_owner = dwxgmac2_set_tx_owner,
	.set_rx_owner = dwxgmac2_set_rx_owner,
	.get_tx_ls = dwxgmac2_get_tx_ls,
	.get_rx_frame_len = dwxgmac2_get_rx_frame_len,
	.enable_tx_timestamp = dwxgmac2_enable_tx_timestamp,
	.get_tx_timestamp_status = dwxgmac2_get_tx_timestamp_status,
	.get_rx_timestamp_status = dwxgmac2_get_rx_timestamp_status,
	.get_timestamp = dwxgmac2_get_timestamp,
	.set_tx_ic = dwxgmac2_set_tx_ic,
	.prepare_tx_desc = dwxgmac2_prepare_tx_desc,
	.prepare_tso_tx_desc = dwxgmac2_prepare_tso_tx_desc,
	.release_tx_desc = dwxgmac2_release_tx_desc,
	.init_rx_desc = dwxgmac2_init_rx_desc,
	.init_tx_desc = dwxgmac2_init_tx_desc,
	.set_mss = dwxgmac2_set_mss,
	.get_addr = dwxgmac2_get_addr,
	.set_addr = dwxgmac2_set_addr,
	.clear = dwxgmac2_clear,
	.get_rx_hash = dwxgmac2_get_rx_hash,
	.get_rx_header_len = dwxgmac2_get_rx_header_len,
	.set_sec_addr = dwxgmac2_set_sec_addr,
	.set_sarc = dwxgmac2_set_sarc,
	.set_vlan_tag = dwxgmac2_set_vlan_tag,
	.set_vlan = dwxgmac2_set_vlan,
	.get_ovt = dwxgmac2_get_ovt,
	.set_vxlan = dwxgmac2_set_vxlan,
	.display_ring = dwxgmac2_display_ring,
};
diff --git a/drivers/net/ethernet/dapustor/dn200/dwxgmac2_dma.c b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_dma.c
new file mode 100644
index 000000000000..ab73a51331ad
--- /dev/null
+++ b/drivers/net/ethernet/dapustor/dn200/dwxgmac2_dma.c
@@ -0,0 +1,771 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2024, DapuStor Corporation.
+ */ + +#include +#include "dn200.h" +#include "dwxgmac_comm.h" + +static int dwxgmac2_dma_reset(void __iomem *ioaddr, struct mac_device_info *hw) +{ + u32 value = readl(ioaddr + XGMAC_DMA_MODE); + + if (HW_IS_VF(hw)) + return 0; + /* DMA SW reset */ + writel(value | XGMAC_SWR, ioaddr + XGMAC_DMA_MODE); + + return readl_poll_timeout_atomic(ioaddr + XGMAC_DMA_MODE, value, + !(value & XGMAC_SWR), 10, 1000000); +} + +static void dwxgmac2_dma_init(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, int atds, + struct mac_device_info *hw) +{ + u32 value = readl(ioaddr + XGMAC_DMA_SYSBUS_MODE); + + if (HW_IS_VF(hw)) + return; + if (dma_cfg->aal) + value |= XGMAC_AAL; + + if (dma_cfg->eame) + value |= XGMAC_EAME; + if (dma_cfg->onekbbe) + value |= XGMAC_ONEKBBE; + + writel(value, ioaddr + XGMAC_DMA_SYSBUS_MODE); +} + +static void dwxgmac2_dma_init_chan(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_CONTROL(chan)); + if (dma_cfg->pblx8) + value |= XGMAC_PBLx8; + + writel(value, ioaddr + XGMAC_DMA_CH_CONTROL(chan)); + writel(XGMAC_DMA_INT_DEFAULT_EN, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); +} + +static void dwxgmac2_dma_init_rx_chan(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, + dma_addr_t phy, u32 chan, + struct mac_device_info *hw) +{ + u32 rxpbl = dma_cfg->rxpbl ? 
		    : dma_cfg->pbl;
	u32 value;

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));
	value &= ~XGMAC_RxPBL;
	value |= (rxpbl << XGMAC_RxPBL_SHIFT) & XGMAC_RxPBL;
	writel(value, ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));

	writel(upper_32_bits(phy), ioaddr + XGMAC_DMA_CH_RxDESC_HADDR(chan));
	writel(lower_32_bits(phy), ioaddr + XGMAC_DMA_CH_RxDESC_LADDR(chan));
}

/* Program the TX PBL, enable OSP and set the TX descriptor ring base
 * address of @chan.
 */
static void dwxgmac2_dma_init_tx_chan(void __iomem *ioaddr,
				      struct dn200_dma_cfg *dma_cfg,
				      dma_addr_t phy, u32 chan,
				      struct mac_device_info *hw)
{
	/* fall back to the common PBL when no TX-specific value is set */
	u32 txpbl = dma_cfg->txpbl ? : dma_cfg->pbl;
	u32 value;

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan));
	value &= ~XGMAC_TxPBL;
	value |= (txpbl << XGMAC_TxPBL_SHIFT) & XGMAC_TxPBL;
	value |= XGMAC_OSP;
	writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan));

	writel(upper_32_bits(phy), ioaddr + XGMAC_DMA_CH_TxDESC_HADDR(chan));
	writel(lower_32_bits(phy), ioaddr + XGMAC_DMA_CH_TxDESC_LADDR(chan));
}

/* Apply the platform AXI configuration: LPI, outstanding-request
 * limits and the allowed burst lengths. PF only; VFs must not touch
 * the shared bus settings.
 */
static void dwxgmac2_dma_axi(void __iomem *ioaddr, struct dn200_axi *axi,
			     struct mac_device_info *hw)
{
	u32 value = readl(ioaddr + XGMAC_DMA_SYSBUS_MODE);
	int i;

	if (HW_IS_VF(hw))
		return;
	if (axi->axi_lpi_en)
		value |= XGMAC_EN_LPI;
	if (axi->axi_xit_frm)
		value |= XGMAC_LPI_XIT_PKT;

	value &= ~XGMAC_WR_OSR_LMT;
	value |= (axi->axi_wr_osr_lmt << XGMAC_WR_OSR_LMT_SHIFT) &
		 XGMAC_WR_OSR_LMT;

	value &= ~XGMAC_RD_OSR_LMT;
	value |= (axi->axi_rd_osr_lmt << XGMAC_RD_OSR_LMT_SHIFT) &
		 XGMAC_RD_OSR_LMT;

	if (!axi->axi_fb)
		value |= XGMAC_UNDEF;

	/* enable one XGMAC_BLENx bit per burst length listed in axi_blen */
	value &= ~XGMAC_BLEN;
	for (i = 0; i < AXI_BLEN; i++) {
		switch (axi->axi_blen[i]) {
		case 256:
			value |= XGMAC_BLEN256;
			break;
		case 128:
			value |= XGMAC_BLEN128;
			break;
		case 64:
			value |= XGMAC_BLEN64;
			break;
		case 32:
			value |= XGMAC_BLEN32;
			break;
		case 16:
			value |= XGMAC_BLEN16;
			break;
		case 8:
			value |= XGMAC_BLEN8;
			break;
		case 4:
			value |=
				XGMAC_BLEN4;
			break;
		}
	}

	writel(value, ioaddr + XGMAC_DMA_SYSBUS_MODE);
	/* set tx & rx Descriptor Pre-fetch threshold Size as 0x5
	 * 0x5: Threshold is 24 descriptors, descriptor pre-fetch is triggered
	 * when 24 descriptors are left in the cache.
	 * NOTE(review): the RX register below is actually programmed with
	 * 0x4, not the 0x5 this comment claims -- confirm which threshold
	 * is intended.
	 */
	writel(XGMAC_TDPS & 0x5, ioaddr + XGMAC_TX_EDMA_CTRL);
	writel(XGMAC_RDPS & 0x4, ioaddr + XGMAC_RX_EDMA_CTRL);
}

/* Snapshot every DMA register (from XGMAC_DMA_MODE upward) into
 * @reg_space for register dumps.
 */
static void dwxgmac2_dma_dump_regs(void __iomem *ioaddr, u32 *reg_space)
{
	int i;

	for (i = (XGMAC_DMA_MODE / 4); i < XGMAC_REGSIZE; i++)
		reg_space[i] = readl(ioaddr + i * 4);
}

/* Zero the MTL and DMA mode registers of one (queue-base adjusted)
 * channel.
 */
static void dwxgmac2_dma_mode_reset(void __iomem *ioaddr, u8 channel,
				    struct mac_device_info *hw)
{
	channel += DN200_RXQ_START_GET(hw);
	writel(0, ioaddr + XGMAC_MTL_RXQ_FLOW_CONTROL(channel));
	writel(0, ioaddr + XGMAC_MTL_QINTEN(channel));
	writel(0, ioaddr + XGMAC_MTL_TXQ_OPMODE(channel));
	writel(0, ioaddr + XGMAC_DMA_CH_RX_CONTROL(channel));
	writel(0, ioaddr + XGMAC_DMA_CH_TX_CONTROL(channel));
}

/* Toggle forwarding of undersized/error frames (FUF/FEF) on one RX
 * queue. PF only. Note: no queue-base offset is applied here -- the
 * caller passes an absolute MTL queue index.
 */
static void dwxgmac2_dma_rx_all_set(void __iomem *ioaddr, u32 channel,
				    u8 enable, struct mac_device_info *hw)
{
	u32 value, mode;

	if (HW_IS_VF(hw))
		return;
	if (!DN200_MTL_QUEUE_IS_VALID(hw->priv, channel))
		return;

	mode = XGMAC_MTL_RXQ_OPMODE_FUF | XGMAC_MTL_RXQ_OPMODE_FEF;
	if (enable) {
		value = readl(ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));
		writel(value | mode, ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));
	} else {
		value = readl(ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));
		value &= ~mode;
		writel(value, ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));
	}
}

/* Configure the RX operating mode of one queue: store-and-forward or
 * threshold mode, FIFO size (RQS) and the hardware flow-control
 * activation/deactivation levels. PF only.
 */
static void dwxgmac2_dma_rx_mode(void __iomem *ioaddr, int mode,
				 u32 channel, int fifosz, u8 qmode,
				 struct mac_device_info *hw)
{
	u32 value;
	unsigned int rqs = fifosz / 256 - 1;

	if (HW_IS_VF(hw))
		return;
	channel += DN200_RXQ_START_GET(hw);
	if (!DN200_MTL_QUEUE_IS_VALID(hw->priv, channel))
		return;
	value = readl(ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));

	if (mode == SF_DMA_MODE) {
		value |=
XGMAC_RSF; + } else { + value &= ~XGMAC_RSF; + value &= ~XGMAC_RTC; + + if (mode <= 64) + value |= 0x0 << XGMAC_RTC_SHIFT; + else if (mode <= 96) + value |= 0x2 << XGMAC_RTC_SHIFT; + else + value |= 0x3 << XGMAC_RTC_SHIFT; + } + + value &= ~XGMAC_RQS; + value |= (rqs << XGMAC_RQS_SHIFT) & XGMAC_RQS; + + if (fifosz >= 4096 && qmode != MTL_QUEUE_AVB) { + u32 flow = readl(ioaddr + XGMAC_MTL_RXQ_FLOW_CONTROL(channel)); + unsigned int rfd, rfa; + + value |= XGMAC_EHFC; + + /* Set Threshold for Activating Flow Control to min 2 frames, + * i.e. 1500 * 2 = 3000 bytes. + * + * Set Threshold for Deactivating Flow Control to min 1 frame, + * i.e. 1500 bytes. + */ + switch (fifosz) { + case 4096: + /* This violates the above formula because of FIFO size + * limit therefore overflow may occur in spite of this. + */ + rfd = 0x03; /* Full-2.5K */ + rfa = 0x01; /* Full-1.5K */ + break; + /* This violates the above formula because of + * all untag pkts ingress to q15 that have sup fifo size + * q15 will active tx pause when fifo just can store one jumbo pkt, + * will deactivate tx pause when have low fifo size. 
+ */ + case DN200_RX_SUPER_FIFO_SIZE: + if (DN200_RX_SUPER_FIFO_SIZE < (20 * 1024)) + dev_err(hw->priv->device, "supper fize size is too small:%d, at least 20K", + DN200_RX_SUPER_FIFO_SIZE); + rfd = 0x19; /* (20K)-13.5K(25 * 0.5 + 1 = 13.5) = 6.5K (deactivate) */ + rfa = 0x12; /* Full(20K)-10K (18 * 0.5 + 1 = 10K) = 10K (active) */ + break; + + default: + rfd = 0x07; /* Full-4.5K */ + rfa = 0x04; /* Full-3K */ + break; + } + + flow &= ~XGMAC_RFD; + flow |= rfd << XGMAC_RFD_SHIFT; + + flow &= ~XGMAC_RFA; + flow |= rfa << XGMAC_RFA_SHIFT; + + writel(flow, ioaddr + XGMAC_MTL_RXQ_FLOW_CONTROL(channel)); + } + + writel(value | XGMAC_MTL_RXQ_OPMODE_DIS_TCP_EF, + ioaddr + XGMAC_MTL_RXQ_OPMODE(channel)); + /* Enable MTL RX overflow */ + value = readl(ioaddr + XGMAC_MTL_QINTEN(channel)); + writel(value | XGMAC_RXOIE, ioaddr + XGMAC_MTL_QINTEN(channel)); +} + +static void dwxgmac2_dma_tx_mode(void __iomem *ioaddr, int mode, + u32 channel, int fifosz, u8 qmode, u8 tc, + struct mac_device_info *hw) +{ + unsigned int tqs = fifosz / 256 - 1; + u32 value; + + if (HW_IS_VF(hw)) + return; + channel += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_MTL_TXQ_OPMODE(channel)); + + if (mode == SF_DMA_MODE) { + value |= XGMAC_TSF; + } else { + value &= ~XGMAC_TSF; + value &= ~XGMAC_TTC; + + if (mode <= 64) + value |= 0x0 << XGMAC_TTC_SHIFT; + else if (mode <= 96) + value |= 0x2 << XGMAC_TTC_SHIFT; + else if (mode <= 128) + value |= 0x3 << XGMAC_TTC_SHIFT; + else if (mode <= 192) + value |= 0x4 << XGMAC_TTC_SHIFT; + else if (mode <= 256) + value |= 0x5 << XGMAC_TTC_SHIFT; + else if (mode <= 384) + value |= 0x6 << XGMAC_TTC_SHIFT; + else + value |= 0x7 << XGMAC_TTC_SHIFT; + } + + /* Use static TC to Queue mapping */ + /*For PF/VF queue x map to tc x */ + value |= (tc << XGMAC_Q2TCMAP_SHIFT) & XGMAC_Q2TCMAP; + + value &= ~XGMAC_TXQEN; + if (qmode != MTL_QUEUE_AVB) + value |= 0x2 << XGMAC_TXQEN_SHIFT; + else + value |= 0x1 << XGMAC_TXQEN_SHIFT; + + value &= ~XGMAC_TQS; + value |= 
(tqs << XGMAC_TQS_SHIFT) & XGMAC_TQS; + + writel(value, ioaddr + XGMAC_MTL_TXQ_OPMODE(channel)); +} + +static void dwxgmac2_enable_dma_irq(void __iomem *ioaddr, u32 chan, + bool rx, bool tx, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); + if (rx) + value |= XGMAC_DMA_INT_DEFAULT_RX; + if (tx) + value |= XGMAC_DMA_INT_DEFAULT_TX; + + writel(value, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); +} + +static void dwxgmac2_disable_dma_irq(void __iomem *ioaddr, u32 chan, + bool rx, bool tx, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); + if (rx) + value &= ~XGMAC_DMA_INT_DEFAULT_RX; + if (tx) + value &= ~XGMAC_DMA_INT_DEFAULT_TX; + + writel(value, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); +} + +static void dwxgmac2_dma_start_tx(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + value |= XGMAC_TXST; + writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + + value = readl(ioaddr + XGMAC_TX_CONFIG); + value |= XGMAC_CONFIG_TE; + writel(value, ioaddr + XGMAC_TX_CONFIG); +} + +static void dwxgmac2_dma_stop_tx(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + udelay(500); + value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + value &= ~XGMAC_TXST; + writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + + if (HW_IS_VF(hw)) + return; + value = readl(ioaddr + XGMAC_TX_CONFIG); + value &= ~XGMAC_CONFIG_TE; + writel(value, ioaddr + XGMAC_TX_CONFIG); +} + +static void dwxgmac2_dma_start_rx(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan)); + value |= XGMAC_RXST; + writel(value, ioaddr + 
	       XGMAC_DMA_CH_RX_CONTROL(chan));
}

/* Stop the RX DMA of one channel. The 500us delay presumably lets
 * in-flight transfers drain first -- TODO confirm.
 */
static void dwxgmac2_dma_stop_rx(void __iomem *ioaddr, u32 chan,
				 struct mac_device_info *hw)
{
	u32 value;

	udelay(500);

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));
	value &= ~XGMAC_RXST;
	writel(value, ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));
}

/* Decode and acknowledge one channel's DMA interrupt status.
 * @dir restricts handling to the RX or TX status bits; @x accumulates
 * per-cause statistics. Returns a bitmask of handle_rx / handle_tx /
 * tx_hard_error for the caller to act on.
 */
static int dwxgmac2_dma_interrupt(void __iomem *ioaddr,
				  struct dn200_extra_stats *x, u32 chan,
				  u32 dir, struct mac_device_info *hw)
{
	u32 intr_status, intr_en;
	int ret = 0;

	chan += DN200_RXQ_START_GET(hw);
	intr_status = readl(ioaddr + XGMAC_DMA_CH_STATUS(chan));
	intr_en = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan));

	if (dir == DMA_DIR_RX)
		intr_status &= XGMAC_DMA_STATUS_MSK_RX;
	else if (dir == DMA_DIR_TX)
		intr_status &= XGMAC_DMA_STATUS_MSK_TX;

	/* ABNORMAL interrupts */
	if (unlikely(intr_status & XGMAC_AIS)) {
		if (unlikely(intr_status & XGMAC_RBU)) {
			x->rx_buf_unav_irq++;
			ret |= handle_rx;
		}
		if (unlikely(intr_status & XGMAC_TPS)) {
			x->tx_process_stopped_irq++;
			ret |= tx_hard_error;
		}
		if (unlikely(intr_status & XGMAC_FBE)) {
			x->fatal_bus_error_irq++;
			ret |= tx_hard_error;
		}
	}

	/* TX/RX NORMAL interrupts */
	if (likely(intr_status & XGMAC_NIS)) {
		x->normal_irq_n++;

		if (likely(intr_status & XGMAC_RI)) {
			x->rx_normal_irq_n++;
			ret |= handle_rx;
		}
		if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) {
			x->tx_normal_irq_n++;
			ret |= handle_tx;
		}
	}

	/* Clear interrupts (only the sources that are both raised and
	 * enabled)
	 */
	writel(intr_en & intr_status, ioaddr + XGMAC_DMA_CH_STATUS(chan));

	return ret;
}

/* Decode the MAC hardware-feature registers into @dma_cap. Always
 * returns 0.
 */
static int dwxgmac2_get_hw_feature(void __iomem *ioaddr,
				   struct dma_features *dma_cap)
{
	u32 hw_cap;

	/* MAC HW feature 0 */
	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE0);
	dma_cap->vlins = (hw_cap & XGMAC_HWFEAT_SAVLANINS) >> 27;
	dma_cap->rx_coe = (hw_cap & XGMAC_HWFEAT_RXCOESEL) >> 16;
	dma_cap->tx_coe = (hw_cap & XGMAC_HWFEAT_TXCOESEL) >> 14;
	dma_cap->eee = (hw_cap &
			XGMAC_HWFEAT_EEESEL) >> 13;
	dma_cap->atime_stamp = (hw_cap & XGMAC_HWFEAT_TSSEL) >> 12;
	dma_cap->av = (hw_cap & XGMAC_HWFEAT_AVSEL) >> 11;
	dma_cap->av &= !((hw_cap & XGMAC_HWFEAT_RAVSEL) >> 10);
	dma_cap->arpoffsel = (hw_cap & XGMAC_HWFEAT_ARPOFFSEL) >> 9;
	dma_cap->rmon = (hw_cap & XGMAC_HWFEAT_MMCSEL) >> 8;
	dma_cap->pmt_magic_frame = (hw_cap & XGMAC_HWFEAT_MGKSEL) >> 7;
	dma_cap->pmt_remote_wake_up = (hw_cap & XGMAC_HWFEAT_RWKSEL) >> 6;
	dma_cap->vlhash = (hw_cap & XGMAC_HWFEAT_VLHASH) >> 4;
	dma_cap->mbps_1000 = (hw_cap & XGMAC_HWFEAT_GMIISEL) >> 1;

	/* MAC HW feature 1 */
	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE1);
	dma_cap->l3l4fnum = (hw_cap & XGMAC_HWFEAT_L3L4FNUM) >> 27;
	/* raw value 0xa is translated to 32 L3/L4 filters */
	if (dma_cap->l3l4fnum == 0xa)
		dma_cap->l3l4fnum = 32;

	dma_cap->hash_tb_sz = (hw_cap & XGMAC_HWFEAT_HASHTBLSZ) >> 24;
	dma_cap->tc_cnt = ((hw_cap & XGMAC_HWFEAT_NUMTC) >> 21) + 1;
	dma_cap->rssen = (hw_cap & XGMAC_HWFEAT_RSSEN) >> 20;
	dma_cap->tsoen = (hw_cap & XGMAC_HWFEAT_TSOEN) >> 18;
	dma_cap->sphen = (hw_cap & XGMAC_HWFEAT_SPHEN) >> 17;

	/* translate the 2-bit ADDR64 encoding to an address width in bits */
	dma_cap->addr64 = (hw_cap & XGMAC_HWFEAT_ADDR64) >> 14;
	switch (dma_cap->addr64) {
	case 0:
		dma_cap->addr64 = 32;
		break;
	case 1:
		dma_cap->addr64 = 40;
		break;
	case 2:
		dma_cap->addr64 = 48;
		break;
	default:
		dma_cap->addr64 = 32;
		break;
	}

	dma_cap->tx_fifo_size =
		128 << ((hw_cap & XGMAC_HWFEAT_TXFIFOSIZE) >> 6);
	dma_cap->rx_fifo_size =
		128 << ((hw_cap & XGMAC_HWFEAT_RXFIFOSIZE) >> 0);

	/* MAC HW feature 2 */
	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE2);
	dma_cap->pps_out_num = (hw_cap & XGMAC_HWFEAT_PPSOUTNUM) >> 24;
	dma_cap->number_tx_channel =
		((hw_cap & XGMAC_HWFEAT_TXCHCNT) >> 18) + 1;
	dma_cap->number_rx_channel =
		((hw_cap & XGMAC_HWFEAT_RXCHCNT) >> 12) + 1;
	dma_cap->number_tx_queues = ((hw_cap & XGMAC_HWFEAT_TXQCNT) >> 6) + 1;
	dma_cap->number_rx_queues = ((hw_cap & XGMAC_HWFEAT_RXQCNT) >> 0) + 1;

	/* MAC HW feature 3 */
	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE3);
	dma_cap->tbssel = (hw_cap & XGMAC_HWFEAT_TBSSEL) >> 27;
	dma_cap->fpesel = (hw_cap & XGMAC_HWFEAT_FPESEL) >> 26;
	dma_cap->estwid = (hw_cap & XGMAC_HWFEAT_ESTWID) >> 23;
	dma_cap->estdep = (hw_cap & XGMAC_HWFEAT_ESTDEP) >> 20;
	dma_cap->estsel = (hw_cap & XGMAC_HWFEAT_ESTSEL) >> 19;
	dma_cap->dcben = (hw_cap & XGMAC_HWFEAT_DCBEN) >> 16;
	dma_cap->asp = (hw_cap & XGMAC_HWFEAT_ASP) >> 14;
	dma_cap->dvlan = (hw_cap & XGMAC_HWFEAT_DVLAN) >> 13;
	dma_cap->frpes = (hw_cap & XGMAC_HWFEAT_FRPES) >> 11;
	dma_cap->frpbs = (hw_cap & XGMAC_HWFEAT_FRPPB) >> 9;
	dma_cap->frpsel = (hw_cap & XGMAC_HWFEAT_FRPSEL) >> 3;

	return 0;
}

/* Program the RX interrupt watchdog (coalescing) timer for @queue. */
static void dwxgmac2_rx_watchdog(void __iomem *ioaddr, u32 riwt, u32 queue,
				 struct mac_device_info *hw)
{
	queue += DN200_RXQ_START_GET(hw);
	/* rx watchdog timer unit use b01(512) system clock cycles */
	writel(((riwt & XGMAC_RWT) | (1 << XGMAC_RWTU_SHIFT)),
	       ioaddr + XGMAC_DMA_CH_Rx_WATCHDOG(queue));
}

/* Set the RX descriptor ring length of @chan. */
static void dwxgmac2_set_rx_ring_len(void __iomem *ioaddr, u32 len, u32 chan,
				     struct mac_device_info *hw)
{
	chan += DN200_RXQ_START_GET(hw);
	writel(len, ioaddr + XGMAC_DMA_CH_RxDESC_RING_LEN(chan));
}

/* Set the TX descriptor ring length of @chan. */
static void dwxgmac2_set_tx_ring_len(void __iomem *ioaddr, u32 len, u32 chan,
				     struct mac_device_info *hw)
{
	chan += DN200_RXQ_START_GET(hw);
	writel(len, ioaddr + XGMAC_DMA_CH_TxDESC_RING_LEN(chan));
}

/* Write the RX descriptor ring tail pointer of @chan. */
static void dwxgmac2_set_rx_tail_ptr(void __iomem *ioaddr, u32 ptr, u32 chan,
				     struct mac_device_info *hw)
{
	chan += DN200_RXQ_START_GET(hw);
	writel(ptr, ioaddr + XGMAC_DMA_CH_RxDESC_TAIL_LPTR(chan));
}

/* Read the current RX descriptor pointer of @chan. */
static u32 dwxgmac2_get_rx_curr_ptr(void __iomem *ioaddr, u32 chan,
				    struct mac_device_info *hw)
{
	u32 value;

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_RxDESC_CURR_LPTR(chan));

	return value;
}

/* Write the TX descriptor ring tail pointer of @chan. */
static void dwxgmac2_set_tx_tail_ptr(void __iomem *ioaddr, u32 ptr, u32 chan,
				     struct mac_device_info *hw)
{
	chan += DN200_RXQ_START_GET(hw);
	writel(ptr, ioaddr + XGMAC_DMA_CH_TxDESC_TAIL_LPTR(chan));
}

/* Enable or disable TSO on one TX DMA channel. */
static void dwxgmac2_enable_tso(void __iomem *ioaddr, bool en, u32 chan,
				struct mac_device_info *hw)
{
	u32 value;

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan));
	if (en)
		value |= XGMAC_TSE;
	else
		value &= ~XGMAC_TSE;

	writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan));
}

/* Switch one TX queue between DCB (TXQEN=2) and AVB (TXQEN=1) mode.
 * NOTE(review): XGMAC_RX_FLOW_CTRL is read before the queue-base offset
 * is applied, and clearing XGMAC_RFE in the AVB branch disables RX flow
 * control for the whole MAC, not just this queue -- confirm intended.
 */
static void dwxgmac2_qmode(void __iomem *ioaddr, u32 channel, u8 qmode,
			   struct mac_device_info *hw)
{
	u32 value;
	u32 flow = readl(ioaddr + XGMAC_RX_FLOW_CTRL);

	channel += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_MTL_TXQ_OPMODE(channel));
	value &= ~XGMAC_TXQEN;
	if (qmode != MTL_QUEUE_AVB) {
		value |= 0x2 << XGMAC_TXQEN_SHIFT;
		writel(0, ioaddr + XGMAC_MTL_TCx_ETS_CONTROL(channel));
	} else {
		value |= 0x1 << XGMAC_TXQEN_SHIFT;
		writel(flow & (~XGMAC_RFE), ioaddr + XGMAC_RX_FLOW_CTRL);
	}

	writel(value, ioaddr + XGMAC_MTL_TXQ_OPMODE(channel));
}

/* Program the RX DMA buffer size (RBSZ) of one channel. */
static void dwxgmac2_set_bfsize(void __iomem *ioaddr, int bfsize, u32 chan,
				struct mac_device_info *hw)
{
	u32 value;

	chan += DN200_RXQ_START_GET(hw);
	value = readl(ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));
	value &= ~XGMAC_RBSZ;
	value |= bfsize << XGMAC_RBSZ_SHIFT;
	writel(value, ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan));
}

/* Enable or disable split-header on one channel. The header segment
 * size (HDSMS = 256 bytes) lives in the MAC-wide RX_CONFIG register and
 * is (re)written unconditionally here.
 */
static void dwxgmac2_enable_sph(void __iomem *ioaddr, bool en, u32 chan,
				struct mac_device_info *hw)
{
	u32 value = readl(ioaddr + XGMAC_RX_CONFIG);

	chan += DN200_RXQ_START_GET(hw);
	value &= ~XGMAC_CONFIG_HDSMS;
	value |= XGMAC_CONFIG_HDSMS_256; /* Segment max 256 bytes */
	writel(value, ioaddr + XGMAC_RX_CONFIG);

	value = readl(ioaddr + XGMAC_DMA_CH_CONTROL(chan));
	if (en)
		value |= XGMAC_SPH;
	else
		value &= ~XGMAC_SPH;
	writel(value, ioaddr + XGMAC_DMA_CH_CONTROL(chan));
}

/* Enable or disable TBS (enhanced descriptors, EDSE) on one TX channel.
 * Returns -EIO when enabling does not stick in hardware.
 */
static int dwxgmac2_enable_tbs(void __iomem *ioaddr, bool en, u32 chan,
			       struct mac_device_info *hw)
{
	u32 value;

	chan +=
DN200_RXQ_START_GET(hw); + value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + if (en) + value |= XGMAC_EDSE; + else + value &= ~XGMAC_EDSE; + + writel(value, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + + value = readl(ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)) & XGMAC_EDSE; + if (en && !value) + return -EIO; + + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL0); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL1); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL2); + writel(XGMAC_DEF_FTOS, ioaddr + XGMAC_DMA_TBS_CTRL3); + return 0; +} + +static void dwxgmac2_dma_reset_chan(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw) +{ + int i; + + chan += DN200_RXQ_START_GET(hw); + writel(0, ioaddr + XGMAC_DMA_CH_CONTROL(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_RX_CONTROL(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_TX_CONTROL(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_TxDESC_HADDR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_TxDESC_LADDR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_RxDESC_HADDR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_RxDESC_LADDR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_TxDESC_TAIL_LPTR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_RxDESC_TAIL_LPTR(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_TxDESC_RING_LEN(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_RxDESC_RING_LEN(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_INT_EN(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_Rx_WATCHDOG(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_STATUS(chan)); + writel(0, ioaddr + XGMAC_DMA_CH_Rx_WATCHDOG(chan)); + for (i = 0; i < XGMAC_PER_REGSIZE; i = i + 4) + writel(0, ioaddr + XGMAC_DMA_CH_CONTROL(chan) + i); +} + +static int dwxgmac2_check_chan_status(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw, bool is_tx) +{ + u32 value; + + chan += DN200_RXQ_START_GET(hw); + + value = readl(ioaddr + XGMAC_CH_DEBUG_ST(chan)); + if (is_tx) { + value = + (value & + (XGMAC_TDWS | XGMAC_TDTS | XGMAC_TDRS | XGMAC_TDXS | + XGMAC_TDFS)); + if (value != 0) + return 1; + } else { + 
value = (value & (XGMAC_RDWS | XGMAC_RDTS | XGMAC_RDFS)); + if (value != 0) + return 1; + } + return 0; +} + +const struct dn200_dma_ops dwxgmac_dma_ops = { + .reset = dwxgmac2_dma_reset, + .init = dwxgmac2_dma_init, + .init_chan = dwxgmac2_dma_init_chan, + .init_rx_chan = dwxgmac2_dma_init_rx_chan, + .init_tx_chan = dwxgmac2_dma_init_tx_chan, + .axi = dwxgmac2_dma_axi, + .dump_regs = dwxgmac2_dma_dump_regs, + .dma_rx_mode = dwxgmac2_dma_rx_mode, + .dma_tx_mode = dwxgmac2_dma_tx_mode, + .dma_rx_all_set = dwxgmac2_dma_rx_all_set, + .dma_mode_reset = dwxgmac2_dma_mode_reset, + .enable_dma_irq = dwxgmac2_enable_dma_irq, + .disable_dma_irq = dwxgmac2_disable_dma_irq, + .start_tx = dwxgmac2_dma_start_tx, + .stop_tx = dwxgmac2_dma_stop_tx, + .start_rx = dwxgmac2_dma_start_rx, + .stop_rx = dwxgmac2_dma_stop_rx, + .dma_interrupt = dwxgmac2_dma_interrupt, + .get_hw_feature = dwxgmac2_get_hw_feature, + .rx_watchdog = dwxgmac2_rx_watchdog, + .set_rx_ring_len = dwxgmac2_set_rx_ring_len, + .set_tx_ring_len = dwxgmac2_set_tx_ring_len, + .set_rx_tail_ptr = dwxgmac2_set_rx_tail_ptr, + .set_tx_tail_ptr = dwxgmac2_set_tx_tail_ptr, + .get_rx_curr_ptr = dwxgmac2_get_rx_curr_ptr, + .enable_tso = dwxgmac2_enable_tso, + .qmode = dwxgmac2_qmode, + .set_bfsize = dwxgmac2_set_bfsize, + .enable_sph = dwxgmac2_enable_sph, + .enable_tbs = dwxgmac2_enable_tbs, + .dma_reset_chan = dwxgmac2_dma_reset_chan, + .check_chan_status = dwxgmac2_check_chan_status, +}; diff --git a/drivers/net/ethernet/dapustor/dn200/dwxgmac_comm.h b/drivers/net/ethernet/dapustor/dn200/dwxgmac_comm.h new file mode 100644 index 000000000000..306e24afee54 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/dwxgmac_comm.h @@ -0,0 +1,630 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ * + */ +#ifndef __DWXGMAC_COMM_H__ +#define __DWXGMAC_COMM_H__ +#include "common.h" +#include "dn200_self.h" + +#define DN200_RXQ_START_GET(hw) \ + ((hw)->priv->plat_ex->rx_queue_start) + +#define DN200_RXQ_END_GET(hw) \ + ((hw)->priv->plat_ex->rx_queue_start + (hw)->priv->plat->rx_queues_to_use - 1) + +#define DN200_TXQ_START_GET(hw) \ + ((hw)->priv->plat_ex->tx_queue_start) +#define DN200_TXQ_END_GET(hw) \ + ((hw)->priv->plat_ex->tx_queue_start + (hw)->priv->plat->tx_queues_to_use - 1) + +#define DN200_SRIOV_PF_QUEUES_GET(hw) \ + ((hw)->priv->plat_ex->rx_queues_total - (hw)->priv->plat_ex->rx_queues_reserved) + +/* Misc */ +#define XGMAC_JUMBO_LEN 16368 + +/* MAC Registers */ +#define XGMAC_TX_CONFIG 0x00000000 +#define XGMAC_CONFIG_SS_OFF 29 +#define XGMAC_CONFIG_SS_MASK GENMASK(31, 29) +#define XGMAC_CONFIG_SS_10000 (0x0 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_2500_GMII (0x2 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_1000_GMII (0x3 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_100_MII (0x4 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_5000 (0x5 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_2500 (0x6 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_SS_10_MII (0x7 << XGMAC_CONFIG_SS_OFF) +#define XGMAC_CONFIG_VNE BIT(24) +#define XGMAC_CONFIG_SARC GENMASK(22, 20) +#define XGMAC_CONFIG_SARC_SHIFT 20 +#define XGMAC_CONFIG_JD BIT(16) +#define XGMAC_CONFIG_TE BIT(0) +#define XGMAC_CORE_INIT_TX (XGMAC_CONFIG_JD) +#define XGMAC_RX_CONFIG 0x00000004 +#define XGMAC_CONFIG_ARPEN BIT(31) +#define XGMAC_CONFIG_GPSL GENMASK(29, 16) +#define XGMAC_CONFIG_GPSL_SHIFT 16 +#define XGMAC_CONFIG_HDSMS GENMASK(14, 12) +#define XGMAC_CONFIG_HDSMS_SHIFT 12 +#define XGMAC_CONFIG_HDSMS_256 (0x2 << XGMAC_CONFIG_HDSMS_SHIFT) +#define XGMAC_CONFIG_S2KP BIT(11) +#define XGMAC_CONFIG_LM BIT(10) +#define XGMAC_CONFIG_IPC BIT(9) +#define XGMAC_CONFIG_JE BIT(8) +#define XGMAC_CONFIG_WD BIT(7) +#define XGMAC_CONFIG_GPSLCE BIT(6) +#define XGMAC_CONFIG_CST BIT(2) +#define 
XGMAC_CONFIG_ACS BIT(1) +#define XGMAC_CONFIG_RE BIT(0) +#define XGMAC_CORE_INIT_RX \ + (XGMAC_CONFIG_GPSLCE | XGMAC_CONFIG_WD | \ + (XGMAC_JUMBO_LEN << XGMAC_CONFIG_GPSL_SHIFT)) +#define XGMAC_PACKET_FILTER 0x00000008 +#define XGMAC_FILTER_RA BIT(31) +#define XGMAC_FILTER_IPFE BIT(20) +#define XGMAC_FILTER_VTFE BIT(16) +#define XGMAC_DHLFRS_MASK GENMASK(12, 11) +#define XGMAC_DHLFRS_SHIFT 11 +#define XGMAC_FILTER_HPF BIT(10) +#define XGMAC_FILTER_PCF GENMASK(7, 6) +#define XGMAC_FILTER_PCF_SHIFT 6 +#define XGMAC_FILTER_PM BIT(4) +#define XGMAC_FILTER_HMC BIT(2) +#define XGMAC_FILTER_PR BIT(0) +#define XGMAC_HASH_TABLE(x) (0x00000010 + (x) * 4) +#define XGMAC_MAX_HASH_TABLE 8 + +#define XGMAC_TUNNEL_IDENTIFIER 0xe0 +#define XGMAC_CONFIG_VNM BIT(25) +#define XGMAC_FILTER_VUCC BIT(22) +#define XGMAC_CONFIG_VNE BIT(24) + +#define XGMAC_VLAN_TAG 0x00000050 +#define XGMAC_VLAN_EDVLP BIT(26) +#define XGMAC_VLAN_VTHM BIT(25) +#define XGMAC_VLAN_TAG_CTRL_EVLRXS BIT(24) +#define XGMAC_VLAN_TAG_CTRL_EVLS_MASK GENMASK(22, 21) +#define XGMAC_VLAN_TAG_CTRL_EVLS_SHIFT 21 +#define XGMAC_VLAN_TAG_STRIP_NONE (0x0 << XGMAC_VLAN_TAG_CTRL_EVLS_SHIFT) +#define XGMAC_VLAN_TAG_STRIP_PASS (0x1 << XGMAC_VLAN_TAG_CTRL_EVLS_SHIFT) +#define XGMAC_VLAN_TAG_STRIP_FAIL (0x2 << XGMAC_VLAN_TAG_CTRL_EVLS_SHIFT) +#define XGMAC_VLAN_TAG_STRIP_ALL (0x3 << XGMAC_VLAN_TAG_CTRL_EVLS_SHIFT) +#define XGMAC_VLAN_DOVLTC BIT(20) +#define XGMAC_VLAN_ERSVLM BIT(19) +#define XGMAC_VLAN_ESVL BIT(18) +#define XGMAC_VLAN_ETV BIT(16) +#define XGMAC_VLAN_ERIVTL_MASK GENMASK(15, 14) +#define XGMAC_VLAN_ERIVTL_SHIFT 14 +#define XGMAC_VLAN_EROVTL_MASK GENMASK(13, 12) +#define XGMAC_VLAN_EROVTL_SHIFT 12 +#define XGMAC_VLAN_VID GENMASK(15, 0) +#define XGMAC_VLAN_HASH_TABLE 0x00000058 +#define XGMAC_VLAN_INCL 0x00000060 +#define XGMAC_VLAN_VLTI BIT(20) +#define XGMAC_VLAN_CSVL BIT(19) +#define XGMAC_VLAN_VLC GENMASK(17, 16) +#define XGMAC_VLAN_VLC_SHIFT 16 +#define XGMAC_VLAN_INNER_INCL 0x00000064 +#define 
XGMAC_VLAN_INNER_VLTI BIT(20) +#define XGMAC_RVLAN_LKP_SIZE 0x00000068 +#define XGMAC_RXQ_CTRL0 0x000000a0 +#define XGMAC_RXQEN(x) GENMASK((x) * 2 + 1, (x) * 2) +#define XGMAC_RXQEN_SHIFT(x) ((x) * 2) +#define XGMAC_RXQ_CTRL1 0x000000a4 +#define XGMAC_MCBCQEN BIT(15) +#define XGMAC_MCBCQ GENMASK(11, 8) +#define XGMAC_MCBCQ_SHIFT 8 +#define XGMAC_RQ GENMASK(7, 4) +#define XGMAC_RQ_SHIFT 4 +#define XGMAC_UPQ GENMASK(3, 0) +#define XGMAC_RQ_SHIFT 4 +#define XGMAC_RXQ_CTRL2 0x000000a8 +#define XGMAC_RXQ_CTRL3 0x000000ac +#define XGMAC_PSRQ(x) GENMASK((x) * 8 + 7, (x) * 8) +#define XGMAC_RXQ_CTRL4 0x00000094 +#define XGMAC_UDC BIT(31) +#define XGMAC_RXQ_CTRL5 0x00000098 +#define XGMAC_PRQSO GENMASK(3, 0) +#define XGMAC_PSRQ_SHIFT(x) ((x) * 8) +#define XGMAC_INT_STATUS 0x000000b0 +#define XGMAC_LPIIS BIT(5) +#define XGMAC_PMTIS BIT(4) +#define XGMAC_INT_EN 0x000000b4 +#define XGMAC_TSIE BIT(12) +#define XGMAC_LPIIE BIT(5) +#define XGMAC_PMTIE BIT(4) +#define XGMAC_INT_DEFAULT_EN (XGMAC_PMTIE) +#define XGMAC_Qx_TX_FLOW_CTRL(x) (0x00000070 + (x) * 4) +#define XGMAC_PT GENMASK(31, 16) +#define XGMAC_PT_SHIFT 16 +#define XGMAC_TFE BIT(1) +#define XGMAC_RX_FLOW_CTRL 0x00000090 +#define XGMAC_PFCE BIT(8) +#define XGMAC_RFE BIT(0) +#define XGMAC_PMT 0x000000c0 +#define XGMAC_GLBLUCAST BIT(9) +#define XGMAC_RWKPKTEN BIT(2) +#define XGMAC_MGKPKTEN BIT(1) +#define XGMAC_PWRDWN BIT(0) +#define XGMAC_LPI_CTRL 0x000000d0 +#define XGMAC_TXCGE BIT(21) +#define XGMAC_LPIATE BIT(20) +#define XGMAC_LPITXA BIT(19) +#define XGMAC_PLS BIT(17) +#define XGMAC_LPITXEN BIT(16) +#define XGMAC_RLPIEX BIT(3) +#define XGMAC_RLPIEN BIT(2) +#define XGMAC_TLPIEX BIT(1) +#define XGMAC_TLPIEN BIT(0) +#define XGMAC_LPI_TIMER_CTRL 0x000000d4 +#define XGMAC_TUNNEL_IDENTIFIER 0xe0 +#define XGMAC_CONFIG_VNM BIT(25) +#define XGMAC_FILTER_VUCC BIT(22) +#define XGMAC_CONFIG_VNE BIT(24) + +#define XGMAC_MAC_DEBUG 0x00000114 +#define XGMAC_MAC_TX_FIFO_ACT GENMASK(31, 16) +#define XGMAC_MAC_RX_FIFO_ACT GENMASK(15, 
0) +#define XGMAC_HW_FEATURE0 0x0000011c +#define XGMAC_HWFEAT_SAVLANINS BIT(27) +#define XGMAC_HWFEAT_RXCOESEL BIT(16) +#define XGMAC_HWFEAT_TXCOESEL BIT(14) +#define XGMAC_HWFEAT_EEESEL BIT(13) +#define XGMAC_HWFEAT_TSSEL BIT(12) +#define XGMAC_HWFEAT_AVSEL BIT(11) +#define XGMAC_HWFEAT_RAVSEL BIT(10) +#define XGMAC_HWFEAT_ARPOFFSEL BIT(9) +#define XGMAC_HWFEAT_MMCSEL BIT(8) +#define XGMAC_HWFEAT_MGKSEL BIT(7) +#define XGMAC_HWFEAT_RWKSEL BIT(6) +#define XGMAC_HWFEAT_VLHASH BIT(4) +#define XGMAC_HWFEAT_GMIISEL BIT(1) +#define XGMAC_HW_FEATURE1 0x00000120 +#define XGMAC_HWFEAT_L3L4FNUM GENMASK(30, 27) +#define XGMAC_HWFEAT_HASHTBLSZ GENMASK(25, 24) +#define XGMAC_HWFEAT_NUMTC GENMASK(23, 21) +#define XGMAC_HWFEAT_RSSEN BIT(20) +#define XGMAC_HWFEAT_TSOEN BIT(18) +#define XGMAC_HWFEAT_SPHEN BIT(17) +#define XGMAC_HWFEAT_DCBEN BIT(16) +#define XGMAC_HWFEAT_ADDR64 GENMASK(15, 14) +#define XGMAC_HWFEAT_TXFIFOSIZE GENMASK(10, 6) +#define XGMAC_HWFEAT_RXFIFOSIZE GENMASK(4, 0) +#define XGMAC_HW_FEATURE2 0x00000124 +#define XGMAC_HWFEAT_PPSOUTNUM GENMASK(26, 24) +#define XGMAC_HWFEAT_TXCHCNT GENMASK(21, 18) +#define XGMAC_HWFEAT_RXCHCNT GENMASK(15, 12) +#define XGMAC_HWFEAT_TXQCNT GENMASK(9, 6) +#define XGMAC_HWFEAT_RXQCNT GENMASK(3, 0) +#define XGMAC_HW_FEATURE3 0x00000128 +#define XGMAC_HWFEAT_TBSSEL BIT(27) +#define XGMAC_HWFEAT_FPESEL BIT(26) +#define XGMAC_HWFEAT_ESTWID GENMASK(24, 23) +#define XGMAC_HWFEAT_ESTDEP GENMASK(22, 20) +#define XGMAC_HWFEAT_ESTSEL BIT(19) +#define XGMAC_HWFEAT_ASP GENMASK(15, 14) +#define XGMAC_HWFEAT_DVLAN BIT(13) +#define XGMAC_HWFEAT_FRPES GENMASK(12, 11) +#define XGMAC_HWFEAT_FRPPB GENMASK(10, 9) +#define XGMAC_HWFEAT_FRPSEL BIT(3) +#define XGMAC_HWFEAT_NRVF GENMASK(2, 0) +#define XGMAC_MACEXT_HD BIT(24) +#define XGMAC_MAC_EXT_CONF 0x0140 +#define XGMAC_DDS_ENABLE BIT(7) +#define XGMAC_MAC_DPP_FSM_INT_STATUS 0x00000150 +#define XGMAC_MAC_FSM_CONTROL 0x00000158 +#define XGMAC_PRTYEN BIT(1) +#define XGMAC_TMOUTEN BIT(0) + +#define 
XGMAC_MDIO_ADDR 0x00010000 +#define XGMAC_MDIO_DATA 0x00010004 +#define XGMAC_MDIO_C22P 0x00010020 +#define XGMAC_MDIO_CHANNEL 0x00040008 +#define XGMAC_FPE_CTRL_STS 0x00000280 + +#define XGMAC_INDIR_ACC_CTRL 0x700 +#define XGMAC_INDIR_ACC_MSEL GENMASK(29, 26) +#define XGMAC_INDIR_ACC_MSEL_SHIFT 26 +#define XGMAC_INDIR_ACC_AOFF_SHIFT 8 +#define XGMAC_INDIR_ACC_COM BIT(1) +#define XGMAC_INDIR_ACC_OB BIT(0) +#define XGMAC_INDIR_ACC_DATA 0x704 +#define XGMAC_EXT_DA_RCH_SHIFT 2 +#define XGMAC_EXT_DA_RCHLE BIT(1) +#define XGMAC_EXT_DA_AF BIT(0) + +#define XGMAC_EFPE BIT(0) +#define XGMAC_ADDRX_HIGH(x) (0x00000300 + (x) * 0x8) +#define XGMAC_ADDR_MAX 32 +#define XGMAC_AE BIT(31) +#define XGMAC_DCS GENMASK(19, 16) +#define XGMAC_DCS_SHIFT 16 +#define XGMAC_ADDRX_LOW(x) (0x00000304 + (x) * 0x8) +#define XGMAC_INDIR_ACCESS_CTRL 0x00000700 +#define XGMAC_INDIR_DCHSEL 0x0 +#define XGMAC_INDIR_EXT_ROVTL 0x9 +#define XGMAC_INDIR_EXT_RIVTL 0xa +#define XGMAC_INDIR_MSEL GENMASK(29, 26) +#define XGMAC_INDIR_MSEL_SHIFT 26 +#define XGMAC_INDIR_AOFF GENMASK(23, 8) +#define XGMAC_INDIR_AOFF_SHIFT 8 +#define XGMAC_INDIR_COM BIT(1) +#define XGMAC_INDIR_OB BIT(0) +#define XGMAC_INDRI_ACCESS_DATA 0x00000704 +#define XGMAC_L3L4_ADDR_CTRL 0x00000c00 +#define XGMAC_IDDR GENMASK(15, 8) +#define XGMAC_IDDR_SHIFT 8 +#define XGMAC_IDDR_FNUM 4 +#define XGMAC_TT BIT(1) +#define XGMAC_XB BIT(0) +#define XGMAC_L3L4_DATA 0x00000c04 +#define XGMAC_L3L4_CTRL 0x0 +#define XGMAC_DMCHEN BIT(31) +#define XGMAC_DMCHN GENMASK(27, 24) +#define XGMAC_DMCHN_SHIFT (24) +#define XGMAC_L4DPIM0 BIT(21) +#define XGMAC_L4DPM0 BIT(20) +#define XGMAC_L4SPIM0 BIT(19) +#define XGMAC_L4SPM0 BIT(18) +#define XGMAC_L4PEN0 BIT(16) +#define XGMAC_L3HDBM0 GENMASK(15, 11) +#define XGMAC_L3HDBM0_SHIFT (11) +#define XGMAC_L3HSBM0_V6 GENMASK(12, 6) +#define XGMAC_L3HSBM0 GENMASK(10, 6) +#define XGMAC_L3HSBM0_SHIFT (6) +#define XGMAC_L3DAIM0 BIT(5) +#define XGMAC_L3DAM0 BIT(4) +#define XGMAC_L3SAIM0 BIT(3) +#define XGMAC_L3SAM0 
BIT(2) +#define XGMAC_L3PEN0 BIT(0) +#define XGMAC_L4_ADDR 0x1 +#define XGMAC_L4DP0 GENMASK(31, 16) +#define XGMAC_L4DP0_SHIFT 16 +#define XGMAC_L4SP0 GENMASK(15, 0) +#define XGMAC_L3_ADDR0 0x4 +#define XGMAC_L3_ADDR1 0x5 +#define XGMAC_L3_ADDR2 0x6 +#define XMGAC_L3_ADDR3 0x7 +#define XGMAC_ARP_ADDR 0x00000c10 +#define XGMAC_RSS_CTRL 0x00000c80 +#define XGMAC_UDP4TE BIT(3) +#define XGMAC_TCP4TE BIT(2) +#define XGMAC_IP2TE BIT(1) +#define XGMAC_RSSE BIT(0) +#define XGMAC_RSS_ADDR 0x00000c88 +#define XGMAC_RSSIA_SHIFT 8 +#define XGMAC_ADDRT BIT(2) +#define XGMAC_CT BIT(1) +#define XGMAC_OB BIT(0) +#define XGMAC_RSS_DATA 0x00000c8c +#define XGMAC_TIMESTAMP_STATUS 0x00000d20 +#define XGMAC_TXTSC BIT(15) +#define XGMAC_TXTIMESTAMP_NSEC 0x00000d30 +#define XGMAC_TXTSSTSLO GENMASK(30, 0) +#define XGMAC_TXTIMESTAMP_SEC 0x00000d34 +#define XGMAC_PPS_CONTROL 0x00000d70 +#define XGMAC_PPS_MAXIDX(x) ((((x) + 1) * 8) - 1) +#define XGMAC_PPS_MINIDX(x) ((x) * 8) +#define XGMAC_PPSx_MASK(x) \ + GENMASK(XGMAC_PPS_MAXIDX(x), XGMAC_PPS_MINIDX(x)) +#define XGMAC_TRGTMODSELX(x, val) \ + (GENMASK(XGMAC_PPS_MAXIDX(x) - 1, XGMAC_PPS_MAXIDX(x) - 2) & \ + ((val) << (XGMAC_PPS_MAXIDX(x) - 2))) +#define XGMAC_PPSCMDX(x, val) \ + (GENMASK(XGMAC_PPS_MINIDX(x) + 3, XGMAC_PPS_MINIDX(x)) & \ + ((val) << XGMAC_PPS_MINIDX(x))) +#define XGMAC_PPSCMD_START 0x2 +#define XGMAC_PPSCMD_STOP 0x5 +#define XGMAC_PPSEN0 BIT(4) +#define XGMAC_PPSx_TARGET_TIME_SEC(x) (0x00000d80 + (x) * 0x10) +#define XGMAC_PPSx_TARGET_TIME_NSEC(x) (0x00000d84 + (x) * 0x10) +#define XGMAC_TRGTBUSY0 BIT(31) +#define XGMAC_PPSx_INTERVAL(x) (0x00000d88 + (x) * 0x10) +#define XGMAC_PPSx_WIDTH(x) (0x00000d8c + (x) * 0x10) + +/* MTL Registers */ +#define XGMAC_MTL_OPMODE 0x00001000 +#define XGMAC_FRPE BIT(15) +#define XGMAC_ETSALG GENMASK(6, 5) +#define XGMAC_WRR (0x0 << 5) +#define XGMAC_WFQ (0x1 << 5) +#define XGMAC_DWRR (0x2 << 5) +#define XGMAC_RAA BIT(2) +#define XGMAC_MTL_INT_STATUS 0x00001020 +#define XGMAC_MTL_RXQ_DMA_MAP0 
0x00001030 +#define XGMAC_MTL_RXQ_DMA_MAP1 0x00001034 +#define XGMAC_QxMDMACH(x) GENMASK((x) * 8 + 6, (x) * 8) +#define XGMAC_QxMDMACH_DYN_SEL(x) BIT((x) * 8 + 7) +#define XGMAC_QxMDMACH_SHIFT(x) ((x) * 8) +#define XGMAC_QDDMACH BIT(7) +#define XGMAC_TC_PRTY_MAP0 0x00001040 +#define XGMAC_TC_PRTY_MAP1 0x00001044 +#define XGMAC_PSTC(x) GENMASK((x) * 8 + 7, (x) * 8) +#define XGMAC_PSTC_SHIFT(x) ((x) * 8) +#define XGMAC_MTL_EST_CONTROL 0x00001050 +#define XGMAC_PTOV GENMASK(31, 23) +#define XGMAC_PTOV_SHIFT 23 +#define XGMAC_SSWL BIT(1) +#define XGMAC_EEST BIT(0) +#define XGMAC_MTL_EST_GCL_CONTROL 0x00001080 +#define XGMAC_BTR_LOW 0x0 +#define XGMAC_BTR_HIGH 0x1 +#define XGMAC_CTR_LOW 0x2 +#define XGMAC_CTR_HIGH 0x3 +#define XGMAC_TER 0x4 +#define XGMAC_LLR 0x5 +#define XGMAC_ADDR_SHIFT 8 +#define XGMAC_GCRR BIT(2) +#define XGMAC_SRWO BIT(0) +#define XGMAC_MTL_EST_GCL_DATA 0x00001084 +#define XGMAC_MTL_RXP_CONTROL_STATUS 0x000010a0 +#define XGMAC_RXPI BIT(31) +#define XGMAC_NPE GENMASK(23, 16) +#define XGMAC_NVE GENMASK(7, 0) +#define XGMAC_MTL_RXP_IACC_CTRL_ST 0x000010b0 +#define XGMAC_STARTBUSY BIT(31) +#define XGMAC_WRRDN BIT(16) +#define XGMAC_ADDR GENMASK(9, 0) +#define XGMAC_MTL_RXP_IACC_DATA 0x000010b4 +#define XGMAC_MTL_ECC_CONTROL 0x000010c0 +#define XGMAC_MTL_SAFETY_INT_STATUS 0x000010c4 +#define XGMAC_MEUIS BIT(1) +#define XGMAC_MECIS BIT(0) +#define XGMAC_MTL_ECC_INT_ENABLE 0x000010c8 +#define XGMAC_RPCEIE BIT(12) +#define XGMAC_ECEIE BIT(8) +#define XGMAC_RXCEIE BIT(4) +#define XGMAC_TXCEIE BIT(0) +#define XGMAC_MTL_ECC_INT_STATUS 0x000010cc +#define XGMAC_MTL_DPP_CONTROL 0x000010e0 +#define XGMAC_MTL_TXQ_OPMODE(x) (0x00001100 + (0x80 * (x))) +#define XGMAC_TQS GENMASK(25, 16) +#define XGMAC_TQS_SHIFT 16 +#define XGMAC_Q2TCMAP GENMASK(10, 8) +#define XGMAC_Q2TCMAP_SHIFT 8 +#define XGMAC_TTC GENMASK(6, 4) +#define XGMAC_TTC_SHIFT 4 +#define XGMAC_TXQEN GENMASK(3, 2) +#define XGMAC_TXQEN_SHIFT 2 +#define XGMAC_TSF BIT(1) +#define XGMAC_MTL_TXQ_DEBUG(x) 
(0x00001108 + (0x80 * (x)))
+#define XGMAC_TXQSTS			BIT(4)
+#define XGMAC_TRCSTS			GENMASK(2, 1)
+#define XGMAC_MTL_RXQ_DEBUG(x)		(0x00001148 + (0x80 * (x)))
+#define XGMAC_RXQSTS			GENMASK(5, 4)
+#define XGMAC_RXQSTS_SHIFT		4
+#define XGMAC_FC_OVER_TH		0x2
+#define XGMAC_FC_QUEUE_FULL		0x3
+#define XGMAC_MTL_TCx_ETS_CONTROL(x)	(0x00001110 + (0x80 * (x)))
+#define XGMAC_MTL_TCx_QUANTUM_WEIGHT(x)	(0x00001118 + (0x80 * (x)))
+#define XGMAC_MTL_TCx_SENDSLOPE(x)	(0x0000111c + (0x80 * (x)))
+#define XGMAC_MTL_TCx_HICREDIT(x)	(0x00001120 + (0x80 * (x)))
+#define XGMAC_MTL_TCx_LOCREDIT(x)	(0x00001124 + (0x80 * (x)))
+#define XGMAC_CC			BIT(3)
+#define XGMAC_TSA			GENMASK(1, 0)
+#define XGMAC_SP			(0x0 << 0)
+#define XGMAC_CBS			(0x1 << 0)
+#define XGMAC_ETS			(0x2 << 0)
+#define XGMAC_MTL_RXQ_OPMODE(x)		(0x00001140 + (0x80 * (x)))
+#define XGMAC_RQS			GENMASK(25, 16)
+#define XGMAC_RQS_SHIFT			16
+#define XGMAC_EHFC			BIT(7)
+#define XGMAC_MTL_RXQ_OPMODE_DIS_TCP_EF	BIT(6)
+#define XGMAC_RSF			BIT(5)
+#define XGMAC_MTL_RXQ_OPMODE_FEF	BIT(4)
+#define XGMAC_MTL_RXQ_OPMODE_FUF	BIT(3)
+#define XGMAC_RTC			GENMASK(1, 0)
+#define XGMAC_RTC_SHIFT			0
+/* Cleanup: duplicate, identical redefinitions of XGMAC_MTL_RXQ_DEBUG(x)
+ * and XGMAC_RXQSTS were removed here; the definitions earlier in this
+ * group remain authoritative.
+ */
+#define XGMAC_PRXQ			GENMASK(29, 16)
+#define XGMAC_MTL_RXQ_WEIGHT(x)		(0x0000114c + (0x80 * (x)))
+#define XGMAC_RXQ_WEIGHT		GENMASK(2, 0)
+#define XGMAC_MTL_RXQ_FLOW_CONTROL(x)	(0x00001150 + (0x80 * (x)))
+#define XGMAC_RFD			GENMASK(31, 17)
+#define XGMAC_RFD_SHIFT			17
+#define XGMAC_RFA			GENMASK(15, 1)
+#define XGMAC_RFA_SHIFT			1
+#define XGMAC_MTL_QINTEN(x)		(0x00001170 + (0x80 * (x)))
+#define XGMAC_RXOIE			BIT(16)
+#define XGMAC_MTL_QINT_STATUS(x)	(0x00001174 + (0x80 * (x)))
+#define XGMAC_RXOVFIS			BIT(16)
+#define XGMAC_ABPSIS			BIT(1)
+#define XGMAC_TXUNFIS			BIT(0)
+#define XGMAC_MAC_REGSIZE		(XGMAC_MTL_QINT_STATUS(15) / 4)
+
+/* DMA Registers */
+#define XGMAC_DMA_MODE			0x00003000
+#define XGMAC_SWR			BIT(0)
+#define XGMAC_DMA_SYSBUS_MODE		0x00003004
+#define XGMAC_WR_OSR_LMT
GENMASK(29, 24) +#define XGMAC_WR_OSR_LMT_SHIFT 24 +#define XGMAC_RD_OSR_LMT GENMASK(21, 16) +#define XGMAC_RD_OSR_LMT_SHIFT 16 +#define XGMAC_EN_LPI BIT(15) +#define XGMAC_LPI_XIT_PKT BIT(14) +#define XGMAC_ONEKBBE BIT(13) +#define XGMAC_AAL BIT(12) +#define XGMAC_EAME BIT(11) +#define XGMAC_BLEN GENMASK(7, 1) +#define XGMAC_BLEN256 BIT(7) +#define XGMAC_BLEN128 BIT(6) +#define XGMAC_BLEN64 BIT(5) +#define XGMAC_BLEN32 BIT(4) +#define XGMAC_BLEN16 BIT(3) +#define XGMAC_BLEN8 BIT(2) +#define XGMAC_BLEN4 BIT(1) +#define XGMAC_UNDEF BIT(0) +#define XGMAC_DEBUG_ST0 0x00003020 +#define XGMAC_AXRHSTS BIT(1) +#define XGMAC_AXWHSTS BIT(0) +#define XGMAC_DEBUG_ST1 0x00003024 +#define XGMAC_TDAS(chan) (1 << (chan)) +#define XGMAC_DEBUG_ST3 0x0000302c +#define XGMAC_TX_EDMA_CTRL 0x00003040 +#define XGMAC_TDPS GENMASK(29, 0) +#define XGMAC_RX_EDMA_CTRL 0x00003044 +#define XGMAC_RDPS GENMASK(29, 0) +#define XGMAC_DMA_TBS_CTRL0 0x00003054 +#define XGMAC_DMA_TBS_CTRL1 0x00003058 +#define XGMAC_DMA_TBS_CTRL2 0x0000305c +#define XGMAC_DMA_TBS_CTRL3 0x00003060 +#define XGMAC_FTOS GENMASK(31, 8) +#define XGMAC_FTOV BIT(0) +#define XGMAC_DEF_FTOS (XGMAC_FTOS | XGMAC_FTOV) +#define XGMAC_DMA_SAFETY_INT_STATUS 0x00003064 +#define XGMAC_MCSIS BIT(31) +#define XGMAC_MSUIS BIT(29) +#define XGMAC_MSCIS BIT(28) +#define XGMAC_DEUIS BIT(1) +#define XGMAC_DECIS BIT(0) +#define XGMAC_DMA_ECC_INT_ENABLE 0x00003068 +#define XGMAC_DCEIE BIT(1) +#define XGMAC_TCEIE BIT(0) +#define XGMAC_DMA_ECC_INT_STATUS 0x0000306c +#define XGMAC_DMA_CH_CONTROL(x) (0x00003100 + (0x80 * (x))) +#define XGMAC_SPH BIT(24) +#define XGMAC_PBLx8 BIT(16) +#define XGMAC_DMA_CH_TX_CONTROL(x) (0x00003104 + (0x80 * (x))) +#define XGMAC_EDSE BIT(28) +#define XGMAC_TxPBL GENMASK(21, 16) +#define XGMAC_TxPBL_SHIFT 16 +#define XGMAC_TSE BIT(12) +#define XGMAC_OSP BIT(4) +#define XGMAC_TXST BIT(0) +#define XGMAC_DMA_CH_RX_CONTROL(x) (0x00003108 + (0x80 * (x))) +#define XGMAC_RxPBL GENMASK(21, 16) +#define XGMAC_RxPBL_SHIFT 16 
+#define XGMAC_RBSZ GENMASK(14, 1) +#define XGMAC_RBSZ_SHIFT 1 +#define XGMAC_RXST BIT(0) +#define XGMAC_DMA_CH_TxDESC_HADDR(x) (0x00003110 + (0x80 * (x))) +#define XGMAC_DMA_CH_TxDESC_LADDR(x) (0x00003114 + (0x80 * (x))) +#define XGMAC_DMA_CH_RxDESC_HADDR(x) (0x00003118 + (0x80 * (x))) +#define XGMAC_DMA_CH_RxDESC_LADDR(x) (0x0000311c + (0x80 * (x))) +#define XGMAC_DMA_CH_TxDESC_TAIL_LPTR(x) (0x00003124 + (0x80 * (x))) +#define XGMAC_DMA_CH_RxDESC_TAIL_LPTR(x) (0x0000312c + (0x80 * (x))) +#define XGMAC_DMA_CH_TxDESC_RING_LEN(x) (0x00003130 + (0x80 * (x))) +#define XGMAC_DMA_CH_RxDESC_RING_LEN(x) (0x00003134 + (0x80 * (x))) +#define XGMAC_DMA_CH_INT_EN(x) (0x00003138 + (0x80 * (x))) +#define XGMAC_NIE BIT(15) +#define XGMAC_AIE BIT(14) +#define XGMAC_RBUE BIT(7) +#define XGMAC_RIE BIT(6) +#define XGMAC_TBUE BIT(2) +#define XGMAC_TIE BIT(0) +#define XGMAC_DMA_INT_DEFAULT_EN \ + (XGMAC_NIE | XGMAC_AIE | XGMAC_RBUE | XGMAC_RIE | XGMAC_TIE) +#define XGMAC_DMA_INT_DEFAULT_RX (XGMAC_RBUE | XGMAC_RIE) +#define XGMAC_DMA_INT_DEFAULT_TX (XGMAC_TIE) +#define XGMAC_DMA_CH_Rx_WATCHDOG(x) (0x0000313c + (0x80 * (x))) +#define XGMAC_RWT GENMASK(7, 0) +#define XGMAC_RWTU_SHIFT 12 +#define XGMAC_DMA_CH_TxDESC_CURR_LPTR(x) (0x00003144 + (0x80 * (x))) +#define XGMAC_DMA_CH_RxDESC_CURR_LPTR(x) (0x0000314c + (0x80 * (x))) +#define XGMAC_DMA_CH_STATUS(x) (0x00003160 + (0x80 * (x))) +#define XGMAC_NIS BIT(15) +#define XGMAC_AIS BIT(14) +#define XGMAC_FBE BIT(12) +#define XGMAC_RBU BIT(7) +#define XGMAC_RI BIT(6) +#define XGMAC_TBU BIT(2) +#define XGMAC_TPS BIT(1) +#define XGMAC_TI BIT(0) +#define XGMAC_CH_DEBUG_ST(x) (0x00003164 + (0x80 * (x))) +#define XGMAC_RXDMA_FSM_STATE 0x100000 +#define XGMAC_RXDMA_FSM_STATE_MASK 0xffff0000 +#define XGMAC_TXDMA_FSM_STATE 0x100 +#define XGMAC_TXDMA_FSM_STATE_MASK 0xffff +#define XGMAC_RDWS GENMASK(30, 28) +#define XGMAC_RDWS_SHIFT 28 +#define XGMAC_RDTS GENMASK(27, 19) +#define XGMAC_RDTS_SHIFT 19 +#define XGMAC_RDFS GENMASK(18, 16) +#define 
XGMAC_RDFS_SHIFT 16 +#define XGMAC_TDWS GENMASK(14, 12) +#define XGMAC_TDWS_SHIFT 12 +#define XGMAC_TDTS GENMASK(11, 8) +#define XGMAC_TDTS_SHIFT 8 +#define XGMAC_TDRS GENMASK(7, 6) +#define XGMAC_TDRS_SHIFT 6 +#define XGMAC_TDXS GENMASK(5, 3) +#define XGMAC_TDXS_SHIFT 3 +#define XGMAC_TDFS GENMASK(2, 0) +#define XGMAC_TDFS_SHIFT 0 +#define XGMAC_CH_DESC_CACHE_LVL(x) (0x00003168 + (0x80 * (x))) +#define XGMAC_RXLVL GENMASK(22, 16) +#define XGMAC_RXLVL_SHIFT 16 +#define XGMAC_TXLVL GENMASK(6, 0) +#define XGMAC_REGSIZE ((0x0000317c + (0x80 * 15)) / 4) +#define XGMAC_PER_REGSIZE 0x7c +#define XGMAC_DMA_STATUS_MSK_COMMON (XGMAC_NIS | XGMAC_AIS | XGMAC_FBE) +#define XGMAC_DMA_STATUS_MSK_RX \ + (XGMAC_RBU | XGMAC_RI | XGMAC_DMA_STATUS_MSK_COMMON) +#define XGMAC_DMA_STATUS_MSK_TX \ + (XGMAC_TBU | XGMAC_TPS | XGMAC_TI | XGMAC_DMA_STATUS_MSK_COMMON) + +/* Descriptors */ +#define XGMAC_TDES0_LTV BIT(31) +#define XGMAC_TDES0_LT GENMASK(7, 0) +#define XGMAC_TDES1_LT GENMASK(31, 8) +#define XGMAC_TDES2_IVT GENMASK(31, 16) +#define XGMAC_TDES2_IVT_SHIFT 16 +#define XGMAC_TDES2_IOC BIT(31) +#define XGMAC_TDES2_TTSE BIT(30) +#define XGMAC_TDES2_B2L GENMASK(29, 16) +#define XGMAC_TDES2_B2L_SHIFT 16 +#define XGMAC_TDES2_VTIR GENMASK(15, 14) +#define XGMAC_TDES2_VTIR_SHIFT 14 +#define XGMAC_TDES2_B1L GENMASK(13, 0) +#define XGMAC_TDES3_OWN BIT(31) +#define XGMAC_TDES3_CTXT BIT(30) +#define XGMAC_TDES3_FD BIT(29) +#define XGMAC_TDES3_LD BIT(28) +#define XGMAC_TDES3_CPC GENMASK(27, 26) +#define XGMAC_TDES3_CPC_SHIFT 26 +#define XGMAC_TDES3_TCMSSV BIT(26) +#define XGMAC_TDES3_SAIC GENMASK(25, 23) +#define XGMAC_TDES3_SAIC_SHIFT 23 +#define XGMAC_TDES3_TBSV BIT(24) +#define XGMAC_TDES3_THL GENMASK(22, 19) +#define XGMAC_TDES3_THL_SHIFT 19 +#define XGMAC_TDES3_IVTIR GENMASK(19, 18) +#define XGMAC_TDES3_IVTIR_SHIFT 18 +#define XGMAC_TDES3_TSE BIT(18) +#define XGMAC_TDES3_IVLTV BIT(17) +#define XGMAC_TDES3_CIC GENMASK(17, 16) +#define XGMAC_TDES3_CIC_SHIFT 16 +#define XGMAC_TDES3_TPL 
GENMASK(17, 0) +#define XGMAC_TDES3_VLTV BIT(16) +#define XGMAC_TDES3_VT GENMASK(15, 0) +#define XGMAC_TDES3_FL GENMASK(14, 0) +#define XGMAC_RDES2_HL GENMASK(9, 0) +#define XGMAC_RDES0_IVT GENMASK(31, 16) +#define XGMAC_RDES0_OVT GENMASK(15, 0) +#define XGMAC_RDES2_L4FM BIT(28) +#define XGMAC_RDES2_L3FM BIT(27) +#define XGMAC_RDES2_MADRM GENMASK(26, 19) +#define XGMAC_RDES2_MADRM_SHIFT (19) +#define XGMAC_RDES2_TNP BIT(11) +#define XGMAC_RDES3_OWN BIT(31) +#define XGMAC_RDES3_CTXT BIT(30) +#define XGMAC_RDES3_IOC BIT(30) +#define XGMAC_RDES3_FD BIT(29) +#define XGMAC_RDES3_LD BIT(28) +#define XGMAC_RDES3_CDA BIT(27) +#define XGMAC_RDES3_RSV BIT(26) +#define XGMAC_RDES3_L34T GENMASK(23, 20) +#define XGMAC_RDES3_L34T_SHIFT 20 +#define XGMAC_L34T_IP4TCP 0x1 +#define XGMAC_L34T_IP4UDP 0x2 +#define XGMAC_L34T_IP6TCP 0x9 +#define XGMAC_L34T_IP6UDP 0xA +#define XGMAC_RDES3_ES BIT(15) +#define XGMAC_RDES3_PL GENMASK(13, 0) +#define XGMAC_RDES3_TSD BIT(6) +#define XGMAC_RDES3_TSA BIT(4) +#define XGMAC_RDES3_COND_SHIFT 16 +#define XGMAC_RDES3_COND GENMASK(19, 16) +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/extern_phy.c b/drivers/net/ethernet/dapustor/dn200/extern_phy.c new file mode 100644 index 000000000000..078f9d9d300c --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/extern_phy.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * + * Driver for PHYs + * + * Copyright (c) 2024 DapuStor Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ *
+ * Support Phys: YT8531
+ */
+
+/* NOTE(review): the header names on the #include lines below appear to have
+ * been stripped (angle-bracket text lost, presumably things like
+ * <linux/phy.h>, <linux/mdio.h>, <linux/module.h>); the file cannot compile
+ * as-is -- restore them from the original submission. TODO confirm.
+ */
+#include
+#include
+#include
+#include
+#include
+#include "dn200.h"
+
+/* YT8531 extended ("debug") register access: write the target register
+ * number to the debug-address register, then access the debug-data register.
+ */
+#define REG_DEBUG_ADDR_OFFSET		0x1e
+#define REG_DEBUG_DATA			0x1f
+/* PHY specific-status register and its fields (speed/duplex/link/pause). */
+#define REG_SPECIFIC_STATUS_OFFSET	0x11
+#define SPECIFIC_STATUS_SPEED_MASK	(0x3 << 14)
+#define SPECIFIC_STATUS_SPEED_1000	(0x2 << 14)
+#define SPECIFIC_STATUS_SPEED_100	(0x1 << 14)
+#define SPECIFIC_STATUS_SPEED_10	(0x0 << 14)
+#define SPECIFIC_STATUS_DUPLEX		(0x1 << 13)
+#define SPECIFIC_STATUS_RESOLVED	(0x1 << 11)
+#define SPECIFIC_STATUS_LINK		(0x1 << 10)
+#define SPECIFIC_STATUS_MDI_X		(0x1 << 6)
+#define SPECIFIC_STATUS_TX_PAUSE	(0x1 << 3)
+#define SPECIFIC_STATUS_RX_PAUSE	(0x1 << 2)
+
+/* Specific-function-control register: MDI/MDI-X crossover selection bits. */
+#define REG_SPECIFIC_FUNCTION_CONTROL	0x10
+#define SFC_FUNC_CTRL_CROSS_MODE_MASK	(3 << 5)
+#define SFC_FUNC_CTRL_MANUAL_MDI	(0 << 5)
+#define SFC_FUNC_CTRL_MANUAL_MDIX	(1 << 5)
+#define SFC_FUNC_CTRL_AUTO_CROSS	(3 << 5)
+
+/* Serialize raw bus->read/bus->write pairs on the MDIO bus so the
+ * address-then-data sequences below are not interleaved by other users.
+ */
+static inline void phy_mdio_bus_lock(struct phy_device *phydev)
+{
+	mutex_lock(&phydev->mdio.bus->mdio_lock);
+}
+
+static inline void phy_mdio_bus_unlock(struct phy_device *phydev)
+{
+	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+}
+
+/* Return the PHY's address on its MDIO bus. */
+static int phy_addr_get(struct phy_device *phydev)
+{
+	int addr = 0;
+
+	addr = phydev->mdio.addr;
+	return addr;
+}
+
+/* Return the MDIO bus the PHY is attached to. */
+static struct mii_bus *phy_mii_bus_get(struct phy_device *phydev)
+{
+	struct mii_bus *bus = NULL;
+
+	bus = phydev->mdio.bus;
+
+	return bus;
+}
+
+/* Read an extended (debug) register @regnum. The address write and data
+ * read are done under the bus mdio_lock so the pair is atomic. Returns the
+ * register value, or a negative error code from the bus accessor.
+ */
+int ytphy_read_ext(struct phy_device *phydev, u32 regnum)
+{
+	int ret;
+	int addr;
+	struct mii_bus *bus;
+
+	addr = phy_addr_get(phydev);
+	bus = phy_mii_bus_get(phydev);
+
+	phy_mdio_bus_lock(phydev);
+	ret = bus->write(bus, addr, REG_DEBUG_ADDR_OFFSET, regnum);
+	if (ret < 0)
+		goto err_handle;
+
+	ret = bus->read(bus, addr, REG_DEBUG_DATA);
+
+err_handle:
+	phy_mdio_bus_unlock(phydev);
+	return ret;
+}
+
+/* Write @val to extended (debug) register @regnum; locking as in
+ * ytphy_read_ext(). Returns 0 or a negative error code.
+ */
+int ytphy_write_ext(struct phy_device *phydev, u32 regnum, u16 val)
+{
+	int ret;
+	int addr;
+	struct mii_bus *bus;
+
+	addr = phy_addr_get(phydev);
+	bus = phy_mii_bus_get(phydev);
+ + phy_mdio_bus_lock(phydev); + ret = bus->write(bus, addr, REG_DEBUG_ADDR_OFFSET, regnum); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, REG_DEBUG_DATA, val); + +err_handle: + phy_mdio_bus_unlock(phydev); + return ret; +} + +static int ytphy_mmd_read(struct phy_device *phydev, u16 mmd, int regnum) +{ + int ret; + int addr; + struct mii_bus *bus; + + addr = phy_addr_get(phydev); + bus = phy_mii_bus_get(phydev); + + phy_mdio_bus_lock(phydev); + ret = bus->write(bus, addr, MII_MMD_CTRL, mmd); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, MII_MMD_DATA, regnum); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, MII_MMD_CTRL, mmd | MII_MMD_CTRL_NOINCR); + if (ret < 0) + goto err_handle; + + ret = bus->read(bus, addr, MII_MMD_DATA); + +err_handle: + phy_mdio_bus_unlock(phydev); + return ret; +} + +static int ytphy_mmd_write(struct phy_device *phydev, u16 mmd, int regnum, u16 val) +{ + int ret; + int addr; + struct mii_bus *bus; + + addr = phy_addr_get(phydev); + bus = phy_mii_bus_get(phydev); + + phy_mdio_bus_lock(phydev); + ret = bus->write(bus, addr, MII_MMD_CTRL, mmd); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, MII_MMD_DATA, regnum); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, MII_MMD_CTRL, mmd | MII_MMD_CTRL_NOINCR); + if (ret < 0) + goto err_handle; + + ret = bus->write(bus, addr, MII_MMD_DATA, val); + +err_handle: + phy_mdio_bus_unlock(phydev); + return ret; +} + +void extern_phy_force_led(struct phy_device *phydev, struct dn200_priv *priv, u32 index, u32 mode) +{ + u32 val; + int ret; + + if (!phydev || index >= 3 || mode >= 3) + return; + + ret = ytphy_read_ext(phydev, 0xa00b); + if (ret < 0) + return; + + val = ret; + /* index: 0 -> active; 1 -> 1000M; 2 -> 100M */ + if (index) { + /* index 1 off */ + val &= ~(0x7 << 3); + val |= (priv->plat_ex->hw_rj45_type ? 
0x4 : 0x5) << 3; + /* index 2 off */ + val &= ~(0x7 << 6); + val |= 0x4 << 6; + } + val &= ~(0x7 << (index * 3)); + + switch (mode) { + case 0: + /* force on */ + val |= (index == 2 || priv->plat_ex->hw_rj45_type ? 0x5 : 0x4) << (index * 3); + break; + case 1: + /* force off */ + val |= (index == 2 || priv->plat_ex->hw_rj45_type ? 0x4 : 0x5) << (index * 3); + break; + case 2: + /* force blink */ + val |= 0x6 << (index * 3); + break; + default: + val = ret; + break; + } + + ytphy_write_ext(phydev, 0xa00b, val); +} + +void extern_phy_init(struct phy_device *phydev, u8 hw_type) +{ + int val = 0; + + ytphy_write_ext(phydev, 0xa012, 0x00c8); + ytphy_write_ext(phydev, 0xa001, 0x8160); + + /* init para */ + ytphy_write_ext(phydev, 0x52, 0x231d); + ytphy_write_ext(phydev, 0x51, 0x04a9); + ytphy_write_ext(phydev, 0x57, 0x274c); + + phy_write(phydev, 0, 0x9140); + + /* for rgmii 1.8V */ + ytphy_write_ext(phydev, 0xa010, 0xabff); + + /* close all led */ + val = ytphy_read_ext(phydev, 0xa00b); + if (val >= 0) { + val &= ~0x1ff; + val |= hw_type ? 0x124 : 0x12d; + ytphy_write_ext(phydev, 0xa00b, val); + } + /* set rx tx delay time, default 0xf0 */ + ytphy_write_ext(phydev, 0xa003, 0x04f6); + + /* disable eee */ + val = ytphy_mmd_read(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV); + if (val >= 0) { + val &= ~(MDIO_AN_EEE_ADV_100TX | MDIO_AN_EEE_ADV_1000T); + ytphy_mmd_write(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV, val); + } + + /* enable stats */ + ytphy_write_ext(phydev, 0xa0, 0xa8d0); +} + +int extern_phy_read_status(struct phy_device *phydev) +{ + int ret; + u16 val; + + ret = phy_read(phydev, REG_SPECIFIC_STATUS_OFFSET); + if (ret < 0) + return ret; + + val = ret; + + phydev->link = val & SPECIFIC_STATUS_LINK ? 
DN200_LINK_UP : DN200_LINK_DOWN;
+	if (phydev->link == DN200_LINK_DOWN) {
+		/* No link: speed/duplex are meaningless, report unknown. */
+		phydev->speed = SPEED_UNKNOWN;
+		phydev->duplex = DUPLEX_UNKNOWN;
+		return 0;
+	}
+
+	/* Decode the resolved speed from the specific-status register. */
+	switch (val & SPECIFIC_STATUS_SPEED_MASK) {
+	case SPECIFIC_STATUS_SPEED_1000:
+		phydev->speed = SPEED_1000;
+		break;
+	case SPECIFIC_STATUS_SPEED_100:
+		phydev->speed = SPEED_100;
+		break;
+	case SPECIFIC_STATUS_SPEED_10:
+		phydev->speed = SPEED_10;
+		break;
+	default:
+		phydev->speed = SPEED_UNKNOWN;
+		break;
+	}
+
+	phydev->duplex = val & SPECIFIC_STATUS_DUPLEX ? DUPLEX_FULL : DUPLEX_HALF;
+
+	return 0;
+}
+
+/* Report the pause flow-control result negotiated by the PHY.
+ * Reads the specific-status register and fills @tx_pause/@rx_pause.
+ * Returns -EOPNOTSUPP when the link is down (no valid result), a negative
+ * error from phy_read() on bus failure, 0 on success.
+ */
+int extern_phy_pause_autoneg_result(struct phy_device *phydev, bool *tx_pause, bool *rx_pause)
+{
+	int ret;
+	u16 val;
+
+	if (!phydev->link)
+		return -EOPNOTSUPP;
+
+	ret = phy_read(phydev, REG_SPECIFIC_STATUS_OFFSET);
+	if (ret < 0)
+		return ret;
+
+	val = ret;
+
+	*rx_pause = !!(val & SPECIFIC_STATUS_RX_PAUSE);
+	*tx_pause = !!(val & SPECIFIC_STATUS_TX_PAUSE);
+
+	return 0;
+}
+
+/* Report the current MDI/MDI-X state (@mdix) and the configured crossover
+ * mode (@mdix_ctrl) as ETH_TP_MDI* values. Both default to
+ * ETH_TP_MDI_INVALID until the status register reports the crossover as
+ * resolved. Returns 0 on success or a negative error from phy_read().
+ */
+int extern_phy_mdix_status_get(struct phy_device *phydev, u8 *mdix, u8 *mdix_ctrl)
+{
+	int ret;
+	u16 val;
+
+	ret = phy_read(phydev, REG_SPECIFIC_STATUS_OFFSET);
+	if (ret < 0)
+		return ret;
+
+	val = ret;
+
+	*mdix = ETH_TP_MDI_INVALID;
+	*mdix_ctrl = ETH_TP_MDI_INVALID;
+
+	if (val & SPECIFIC_STATUS_RESOLVED) {
+		if (val & SPECIFIC_STATUS_MDI_X)
+			*mdix = ETH_TP_MDI_X;
+		else
+			*mdix = ETH_TP_MDI;
+
+		/* The configured mode lives in the function-control register. */
+		ret = phy_read(phydev, REG_SPECIFIC_FUNCTION_CONTROL);
+		if (ret < 0)
+			return ret;
+
+		val = ret;
+		switch (val & SFC_FUNC_CTRL_CROSS_MODE_MASK) {
+		case SFC_FUNC_CTRL_MANUAL_MDI:
+			*mdix_ctrl = ETH_TP_MDI;
+			break;
+		case SFC_FUNC_CTRL_MANUAL_MDIX:
+			*mdix_ctrl = ETH_TP_MDI_X;
+			break;
+		case SFC_FUNC_CTRL_AUTO_CROSS:
+			*mdix_ctrl = ETH_TP_MDI_AUTO;
+			break;
+		default:
+			*mdix_ctrl = ETH_TP_MDI_INVALID;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/* Program the MDI/MDI-X crossover mode (@ctrl is an ETH_TP_MDI* value).
+ * Unknown values are silently ignored (returns 0 without touching HW).
+ */
+int extern_phy_mdix_status_set(struct phy_device *phydev, u8 ctrl)
+{
+	int ret;
+	u16 val;
+
+	ret = phy_read(phydev, REG_SPECIFIC_FUNCTION_CONTROL);
+	if (ret < 0)
+		return ret;
+ + val = ret; + val &= ~SFC_FUNC_CTRL_CROSS_MODE_MASK; + + switch (ctrl) { + case ETH_TP_MDI: + val |= SFC_FUNC_CTRL_MANUAL_MDI; + break; + case ETH_TP_MDI_X: + val |= SFC_FUNC_CTRL_MANUAL_MDIX; + break; + case ETH_TP_MDI_AUTO: + val |= SFC_FUNC_CTRL_AUTO_CROSS; + break; + default: + return 0; + } + + ret = phy_write(phydev, REG_SPECIFIC_FUNCTION_CONTROL, val); + if (ret < 0) + return ret; + + ret = phy_read(phydev, MII_BMCR); + if (ret < 0) + return ret; + + val = ret; + return phy_write(phydev, MII_BMCR, val | BMCR_RESET); +} + diff --git a/drivers/net/ethernet/dapustor/dn200/hwif.c b/drivers/net/ethernet/dapustor/dn200/hwif.c new file mode 100644 index 000000000000..72ed3cf7b46a --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/hwif.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ + +#include "common.h" +#include "dn200.h" +#include "dn200_ptp.h" + +static u32 dn200_get_id(struct dn200_priv *priv, u32 id_reg) +{ + u32 reg = readl(priv->ioaddr + id_reg); + + if (!reg) { + dev_info(priv->device, "Version ID not available\n"); + return 0x0; + } + + dev_dbg(priv->device, "User ID: 0x%x, Chip ID: 0x%x\n", + (unsigned int)(reg & GENMASK(15, 8)) >> 8, + (unsigned int)(reg & GENMASK(7, 0))); + return reg & GENMASK(7, 0); +} + +static u32 dn200_get_dev_id(struct dn200_priv *priv, u32 id_reg) +{ + u32 reg = readl(priv->ioaddr + id_reg); + + if (!reg) { + dev_info(priv->device, "Version ID not available\n"); + return 0x0; + } + + return (reg & GENMASK(15, 8)) >> 8; +} + +static const struct dn200_hwif_entry { + bool gmac; + bool gmac4; + bool xgmac; + bool sriov; + u32 min_id; + u32 dev_id; + const struct dn200_regs_off regs; + const void *desc; + const void *dma; + const void *mac; + const void *hwtimestamp; + const void *mode; + const void *tc; + const void *mmc; + int (*setup)(struct dn200_priv *priv); + int (*quirks)(struct dn200_priv *priv); +} dn200_hw[] = { + /* NOTE: New HW versions 
shall go to the end of this table */ + { + .gmac = false, + .gmac4 = false, + .xgmac = true, + .sriov = true, + .min_id = DWXGMAC_CORE_2_10, + .dev_id = DWXGMAC_ID, + .regs = { + .ptp_off = PTP_XGMAC_OFFSET, + .mmc_off = MMC_XGMAC_OFFSET, + }, + .desc = &dwxgmac210_desc_ops, + .dma = &dwxgmac_dma_ops, + .mac = &dwxgmac_sriov_ops, + .hwtimestamp = &dn200_ptp, + .mode = NULL, + .mmc = &dwxgmac_mmc_ops, + .setup = dwxgmac2_setup, + .quirks = NULL, + }, + { + .gmac = false, + .gmac4 = false, + .xgmac = true, + .min_id = DWXGMAC_CORE_2_10, + .dev_id = DWXGMAC_ID, + .regs = { + .ptp_off = PTP_XGMAC_OFFSET, + .mmc_off = MMC_XGMAC_OFFSET, + }, + .desc = &dwxgmac210_desc_ops, + .dma = &dwxgmac_dma_ops, + .mac = &dwxgmac_purepf_ops, + .hwtimestamp = &dn200_ptp, + .mode = NULL, + .mmc = &dwxgmac_mmc_ops, + .setup = dwxgmac2_setup, + .quirks = NULL, + }, +}; + +bool dn200_dp_hwif_id_check(void __iomem *ioaddr) +{ + static u64 chk_count; + static bool pre_chk_state = true; + +#define DP_SKIP_CHK_COUNT 100 + if (chk_count++ % DP_SKIP_CHK_COUNT == 0) + pre_chk_state = dn200_hwif_id_check(ioaddr); + + return pre_chk_state; +} + +bool dn200_hwif_id_check(void __iomem *ioaddr) +{ + u32 id, dev_id = 0; + u32 reg_val = 0; + + reg_val = readl(ioaddr + GMAC4_VERSION); + id = reg_val & GENMASK(7, 0); + dev_id = (reg_val & GENMASK(15, 8)) >> 8; + if (id < DWXGMAC_CORE_2_10 || dev_id != DWXGMAC_ID) + return false; + + return true; +} + +int dn200_hwif_init(struct dn200_priv *priv) +{ + bool needs_xgmac = priv->plat->has_xgmac; + bool needs_gmac4 = priv->plat->has_gmac4; + bool needs_gmac = priv->plat->has_gmac; + bool needs_sriov = PRIV_SRIOV_SUPPORT(priv) | PRIV_IS_VF(priv); + const struct dn200_hwif_entry *entry; + struct mac_device_info *mac; + bool needs_setup = true; + u32 id, dev_id = 0; + int i, ret; + + if (needs_gmac) { + id = dn200_get_id(priv, GMAC_VERSION); + } else if (needs_gmac4 || needs_xgmac) { + id = dn200_get_id(priv, GMAC4_VERSION); + if (needs_xgmac) + dev_id = 
dn200_get_dev_id(priv, GMAC4_VERSION); + } else { + id = 0; + } + + /* Save ID for later use */ + priv->chip_id = id; + + /* Lets assume some safe values first */ + priv->ptpaddr = priv->ioaddr + + (needs_gmac4 ? PTP_GMAC4_OFFSET : PTP_GMAC3_X_OFFSET); + priv->mmcaddr = priv->ioaddr + + (needs_gmac4 ? MMC_GMAC4_OFFSET : MMC_GMAC3_X_OFFSET); + + /* Check for HW specific setup first */ + if (priv->plat->setup) { + mac = priv->plat->setup(priv); + needs_setup = false; + } else { + mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); + } + + if (!mac) + return -ENOMEM; + /* Fallback to generic HW */ + for (i = ARRAY_SIZE(dn200_hw) - 1; i >= 0; i--) { + entry = &dn200_hw[i]; + if (needs_gmac ^ entry->gmac) + continue; + if (needs_gmac4 ^ entry->gmac) + continue; + if (needs_xgmac ^ entry->xgmac) + continue; + if (needs_sriov ^ entry->sriov) + continue; + /* Use chip_id var because some setups can override this */ + if (priv->chip_id < entry->min_id) + continue; + if (needs_xgmac && (dev_id ^ entry->dev_id)) + continue; + + /* Only use generic HW helpers if needed */ + mac->desc = mac->desc ? : entry->desc; + mac->dma = mac->dma ? : entry->dma; + mac->mac = mac->mac ? : entry->mac; + mac->ptp = mac->ptp ? : entry->hwtimestamp; + mac->mode = mac->mode ? : entry->mode; + mac->tc = mac->tc ? : entry->tc; + mac->mmc = mac->mmc ? 
: entry->mmc; + + mac->priv = priv; + priv->hw = mac; + priv->ptpaddr = priv->ioaddr + entry->regs.ptp_off; + priv->mmcaddr = priv->ioaddr + entry->regs.mmc_off; + + /* Entry found */ + if (needs_setup) { + ret = entry->setup(priv); + if (ret) + return ret; + } + + /* Save quirks, if needed for posterior use */ + priv->hwif_quirks = entry->quirks; + return 0; + } + + dev_err(priv->device, "Failed to find HW IF (id=0x%x, gmac=%d/%d)\n", + id, needs_gmac, needs_gmac4); + return -EINVAL; +} diff --git a/drivers/net/ethernet/dapustor/dn200/hwif.h b/drivers/net/ethernet/dapustor/dn200/hwif.h new file mode 100644 index 000000000000..ca185d719e6f --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/hwif.h @@ -0,0 +1,778 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (c) 2024, DapuStor Corporation. + */ + +#ifndef __DN200_HWIF_H__ +#define __DN200_HWIF_H__ + +#include +#include "dn200_cfg.h" +#include "common.h" +#define dn200_do_void_callback(__priv, __module, __cname, __arg0, __args...) \ +({ \ + int __result = -EINVAL; \ + if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) { \ + (__priv)->hw->__module->__cname((__arg0), ##__args); \ + __result = 0; \ + } \ + __result; \ +}) +#define dn200_do_callback(__priv, __module, __cname, __arg0, __args...) 
\ +({ \ + int __result = -EINVAL; \ + if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) \ + __result = (__priv)->hw->__module->__cname((__arg0), ##__args); \ + __result; \ +}) + +struct dn200_extra_stats; +struct dn200_safety_stats; +struct dma_desc; +struct dn200_fdir_filter; + +/* Descriptors helpers */ +struct dn200_desc_ops { + /* DMA RX descriptor ring initialization */ + void (*init_rx_desc)(struct dma_desc *p, int disable_rx_ic, int mode, + int end, int bfsize); + /* DMA TX descriptor ring initialization */ + void (*init_tx_desc)(struct dma_desc *p, int mode, int end); + /* Invoked by the xmit function to prepare the tx descriptor */ + void (*prepare_tx_desc)(struct dma_desc *p, int is_fs, int len, + bool csum_flag, int mode, bool tx_own, bool ls, + unsigned int tot_pkt_len); + void (*prepare_tso_tx_desc)(struct dma_desc *p, int is_fs, int len1, + int len2, bool tx_own, bool ls, + unsigned int tcphdrlen, + unsigned int tcppayloadlen); + /* Set/get the owner of the descriptor */ + void (*set_tx_owner)(struct dma_desc *p); + int (*get_tx_owner)(struct dma_desc *p); + /* Clean the tx descriptor as soon as the tx irq is received */ + void (*release_tx_desc)(struct dma_desc *p, int mode); + /* Clear interrupt on tx frame completion. 
When this bit is + * set an interrupt happens as soon as the frame is transmitted + */ + void (*set_tx_ic)(struct dma_desc *p); + /* Last tx segment reports the transmit status */ + int (*get_tx_ls)(struct dma_desc *p); + /* Return the transmit status looking at the TDES1 */ + int (*tx_status)(void *data, struct dn200_extra_stats *x, + struct dma_desc *p, void __iomem *ioaddr); + /* Get the buffer size from the descriptor */ + int (*get_tx_len)(struct dma_desc *p); + /* Handle extra events on specific interrupts hw dependent */ + void (*set_rx_owner)(struct dma_desc *p, int disable_rx_ic); + /* Get the receive frame size */ + int (*get_rx_frame_len)(struct dma_desc *p, int rx_coe_type); + /* Return the reception status looking at the RDES1 */ + int (*rx_status)(void *data, struct dn200_extra_stats *x, + struct dma_desc *p, bool rec_all); + /* Set tx timestamp enable bit */ + void (*enable_tx_timestamp)(struct dma_desc *p); + /* get tx timestamp status */ + int (*get_tx_timestamp_status)(struct dma_desc *p); + /* get timestamp value */ + void (*get_timestamp)(void *desc, u32 ats, u64 *ts); + /* get rx timestamp status */ + int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats); + /* Display ring */ + void (*display_ring)(void *head, unsigned int size, bool not_tbl, + dma_addr_t dma_rx_phy, unsigned int desc_size, + struct mac_device_info *hw); + /* set MSS via context descriptor */ + void (*set_mss)(struct dma_desc *p, unsigned int mss); + /* get descriptor skbuff address */ + void (*get_addr)(struct dma_desc *p, unsigned int *addr); + /* set descriptor skbuff address */ + void (*set_addr)(struct dma_desc *p, dma_addr_t addr, + struct mac_device_info *hw); + /* clear descriptor */ + void (*clear)(struct dma_desc *p); + /* RSS */ + int (*get_rx_hash)(struct dma_desc *p, u32 *hash, + enum pkt_hash_types *type); + void (*get_rx_header_len)(struct dma_desc *p, unsigned int *len); + void (*set_sec_addr)(struct dma_desc *p, dma_addr_t addr, + bool 
buf2_valid, struct mac_device_info *hw); + void (*set_sarc)(struct dma_desc *p, u32 sarc_type); + void (*set_vlan_tag)(struct dma_desc *p, u16 tag, u16 inner_tag, + u32 inner_type); + void (*set_vlan)(struct dma_desc *p, u32 type); + int (*get_ovt)(struct dma_desc *p); + void (*set_vxlan)(struct dma_desc *p); +}; + +#define dn200_init_rx_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, init_rx_desc, __args) +#define dn200_init_tx_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, init_tx_desc, __args) +#define dn200_prepare_tx_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, prepare_tx_desc, __args) +#define dn200_prepare_tso_tx_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, prepare_tso_tx_desc, __args) +#define dn200_set_tx_owner(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_tx_owner, __args) +#define dn200_get_tx_owner(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_tx_owner, __args) +#define dn200_release_tx_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, release_tx_desc, __args) +#define dn200_set_tx_ic(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_tx_ic, __args) +#define dn200_get_tx_ls(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_tx_ls, __args) +#define dn200_tx_status(__priv, __args...) \ + dn200_do_callback(__priv, desc, tx_status, __args) +#define dn200_get_tx_len(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_tx_len, __args) +#define dn200_set_rx_owner(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_rx_owner, __args) +#define dn200_get_rx_frame_len(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_rx_frame_len, __args) +#define dn200_rx_status(__priv, __args...) \ + dn200_do_callback(__priv, desc, rx_status, __args) +#define dn200_enable_tx_timestamp(__priv, __args...) 
\ + dn200_do_void_callback(__priv, desc, enable_tx_timestamp, __args) +#define dn200_get_tx_timestamp_status(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_tx_timestamp_status, __args) +#define dn200_get_timestamp(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, get_timestamp, __args) +#define dn200_get_rx_timestamp_status(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_rx_timestamp_status, __args) +#define dn200_display_ring(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, display_ring, __args) +#define dn200_set_mss(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_mss, __args) +#define dn200_get_desc_addr(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, get_addr, __args) +#define dn200_set_desc_addr(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_addr, __args) +#define dn200_clear_desc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, clear, __args) +#define dn200_get_rx_hash(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_rx_hash, __args) +#define dn200_get_rx_header_len(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, get_rx_header_len, __args) +#define dn200_set_desc_sec_addr(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_sec_addr, __args) +#define dn200_set_desc_sarc(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_sarc, __args) +#define dn200_set_desc_vlan_tag(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_vlan_tag, __args) +#define dn200_set_desc_vlan(__priv, __args...) \ + dn200_do_void_callback(__priv, desc, set_vlan, __args) +#define dn200_get_ovt(__priv, __args...) \ + dn200_do_callback(__priv, desc, get_ovt, __args) +#define dn200_set_vxlan(__priv, __args...) 
\ + dn200_do_void_callback(__priv, desc, set_vxlan, __args) +struct dn200_dma_cfg; +struct dma_features; + +/* Specific DMA helpers */ +struct dn200_dma_ops { + /* DMA core initialization */ + int (*reset)(void __iomem *ioaddr, struct mac_device_info *hw); + void (*init)(void __iomem *ioaddr, struct dn200_dma_cfg *dma_cfg, + int atds, struct mac_device_info *hw); + void (*init_chan)(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, u32 chan, + struct mac_device_info *hw); + void (*init_rx_chan)(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, dma_addr_t phy, + u32 chan, struct mac_device_info *hw); + void (*init_tx_chan)(void __iomem *ioaddr, + struct dn200_dma_cfg *dma_cfg, dma_addr_t phy, + u32 chan, struct mac_device_info *hw); + /* Configure the AXI Bus Mode Register */ + void (*axi)(void __iomem *ioaddr, struct dn200_axi *axi, + struct mac_device_info *hw); + /* Dump DMA registers */ + void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space); + void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel, + int fifosz, u8 qmode, struct mac_device_info *hw); + void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel, + int fifosz, u8 qmode, u8 tc, + struct mac_device_info *hw); + void (*dma_rx_all_set)(void __iomem *ioaddr, u32 channel, u8 enable, + struct mac_device_info *hw); + void (*dma_mode_reset)(void __iomem *ioaddr, u8 channel, + struct mac_device_info *hw); + /* To track extra statistic (if supported) */ + void (*dma_diagnostic_fr)(void *data, struct dn200_extra_stats *x, + void __iomem *ioaddr, + struct mac_device_info *hw); + void (*enable_dma_transmission)(void __iomem *ioaddr); + void (*enable_dma_irq)(void __iomem *ioaddr, u32 chan, + bool rx, bool tx, struct mac_device_info *hw); + void (*disable_dma_irq)(void __iomem *ioaddr, u32 chan, + bool rx, bool tx, struct mac_device_info *hw); + void (*start_tx)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); + void (*stop_tx)(void __iomem *ioaddr, u32 chan, + struct 
mac_device_info *hw); + void (*start_rx)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); + void (*stop_rx)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); + int (*dma_interrupt)(void __iomem *ioaddr, + struct dn200_extra_stats *x, u32 chan, u32 dir, + struct mac_device_info *hw); + /* If supported then get the optional core features */ + int (*get_hw_feature)(void __iomem *ioaddr, + struct dma_features *dma_cap); + /* Program the HW RX Watchdog */ + void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 queue, + struct mac_device_info *hw); + void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan, + struct mac_device_info *hw); + void (*set_rx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan, + struct mac_device_info *hw); + void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan, + struct mac_device_info *hw); + void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan, + struct mac_device_info *hw); + u32 (*get_rx_curr_ptr)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); + void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan, + struct mac_device_info *hw); + void (*qmode)(void __iomem *ioaddr, u32 channel, u8 qmode, + struct mac_device_info *hw); + void (*set_bfsize)(void __iomem *ioaddr, int bfsize, u32 chan, + struct mac_device_info *hw); + void (*enable_sph)(void __iomem *ioaddr, bool en, u32 chan, + struct mac_device_info *hw); + int (*enable_tbs)(void __iomem *ioaddr, bool en, u32 chan, + struct mac_device_info *hw); + void (*dma_reset_chan)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw); + int (*check_chan_status)(void __iomem *ioaddr, u32 chan, + struct mac_device_info *hw, bool is_tx); +}; + +#define dn200_dma_reset(__priv, __args...) \ + dn200_do_callback(__priv, dma, reset, __args) +#define dn200_dma_init(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, init, __args) +#define dn200_init_chan(__priv, __args...) 
\ + dn200_do_void_callback(__priv, dma, init_chan, __args) +#define dn200_init_rx_chan(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, init_rx_chan, __args) +#define dn200_init_tx_chan(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, init_tx_chan, __args) +#define dn200_axi(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, axi, __args) +#define dn200_dump_dma_regs(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dump_regs, __args) +#define dn200_dma_rx_mode(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_rx_mode, __args) +#define dn200_dma_tx_mode(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_tx_mode, __args) +#define dn200_dma_rx_all_set(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_rx_all_set, __args) +#define dn200_dma_mode_reset(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_mode_reset, __args) +#define dn200_dma_diagnostic_fr(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_diagnostic_fr, __args) +#define dn200_enable_dma_transmission(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, enable_dma_transmission, __args) +#define dn200_enable_dma_irq(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, enable_dma_irq, __args) +#define dn200_disable_dma_irq(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, disable_dma_irq, __args) +#define dn200_start_tx(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, start_tx, __args) +#define dn200_stop_tx(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, stop_tx, __args) +#define dn200_start_rx(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, start_rx, __args) +#define dn200_stop_rx(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, stop_rx, __args) +#define dn200_dma_interrupt_status(__priv, __args...) \ + dn200_do_callback(__priv, dma, dma_interrupt, __args) +#define dn200_get_hw_feature(__priv, __args...) 
\ + dn200_do_callback(__priv, dma, get_hw_feature, __args) +#define dn200_rx_watchdog(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, rx_watchdog, __args) +#define dn200_set_tx_ring_len(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, set_tx_ring_len, __args) +#define dn200_set_rx_ring_len(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, set_rx_ring_len, __args) +#define dn200_set_rx_tail_ptr(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, set_rx_tail_ptr, __args) +#define dn200_set_tx_tail_ptr(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, set_tx_tail_ptr, __args) +#define dn200_get_rx_curr_ptr(__priv, __args...) \ + dn200_do_callback(__priv, dma, get_rx_curr_ptr, __args) +#define dn200_enable_tso(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, enable_tso, __args) +#define dn200_dma_qmode(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, qmode, __args) +#define dn200_set_dma_bfsize(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, set_bfsize, __args) +#define dn200_enable_sph(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, enable_sph, __args) +#define dn200_enable_tbs(__priv, __args...) \ + dn200_do_callback(__priv, dma, enable_tbs, __args) +#define dn200_dma_reset_chan(__priv, __args...) \ + dn200_do_void_callback(__priv, dma, dma_reset_chan, __args) +#define dn200_check_chan_status(__priv, __args...) 
\ + dn200_do_callback(__priv, dma, check_chan_status, __args) + +struct mac_device_info; +struct net_device; +struct rgmii_adv; +struct dn200_tc_entry; +struct dn200_pps_cfg; +struct dn200_rss; +struct dn200_est; + +/* Helpers to program the MAC core */ +struct dn200_ops { + /* MAC core initialization */ + void (*core_init)(struct mac_device_info *hw, struct net_device *dev); + /* Enable the MAC RX/TX */ + void (*set_mac)(void __iomem *ioaddr, bool enable, + struct mac_device_info *hw); + /* Enable the MAC RX */ + void (*set_mac_rx)(void __iomem *ioaddr, bool enable); + /* Get the MAC RX */ + int (*get_mac_rx)(void __iomem *ioaddr); + /* Enable and verify that the IPC module is supported */ + int (*rx_ipc)(struct mac_device_info *hw); + /* Enable RX Queues */ + void (*rx_queue_enable)(struct mac_device_info *hw, u8 mode, + u32 queue); + /* Disable RX Queues */ + void (*rx_queue_disable)(struct mac_device_info *hw, u32 queue); + /* RX Queues Priority */ + void (*rx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); + /* TX Queues Priority */ + void (*tx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); + /* RX Queues Routing */ + void (*rx_queue_routing)(struct mac_device_info *hw, u8 packet, + u32 queue); + /* Program RX Algorithms */ + void (*prog_mtl_rx_algorithms)(struct mac_device_info *hw, u32 rx_alg); + /* Program TX Algorithms */ + void (*prog_mtl_tx_algorithms)(struct mac_device_info *hw, u32 tx_alg); + /* Set MTL TX queues weight */ + void (*set_mtl_tx_queue_weight)(struct mac_device_info *hw, + u32 weight, u32 queue); + /* Set MTL RX queues weight */ + void (*set_mtl_rx_queue_weight)(struct mac_device_info *hw, + u32 weight, u32 queue); + /* RX MTL queue to RX dma mapping */ + void (*map_mtl_to_dma)(struct mac_device_info *hw, u32 queue, + u32 chan); + /* Set RX MTL queue to RX dma mapping as dynamic selection */ + void (*mtl_dynamic_chan_set)(struct mac_device_info *hw, + u32 queue, bool dynamic); + /* Configure AV Algorithm */ + 
void (*config_cbs)(struct mac_device_info *hw, u32 send_slope, + u32 idle_slope, u32 high_credit, u32 low_credit, + u32 queue); + /* Dump MAC registers */ + void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space); + /* Handle extra events on specific interrupts hw dependent */ + int (*host_irq_status)(struct mac_device_info *hw, + struct dn200_extra_stats *x); + /* Handle MTL interrupts */ + int (*host_mtl_irq_status)(struct mac_device_info *hw, u32 chan); + /* Multicast filter setting */ + void (*set_filter)(struct mac_device_info *hw, + struct net_device *dev, u8 *wakeup_wq); + void (*wq_set_filter)(struct mac_device_info *hw, + struct net_device *dev, bool is_vf, + struct dn200_vf_rxp_async_info *async_info); + /* Flow control setting */ + void (*flow_ctrl)(struct mac_device_info *hw, unsigned int duplex, + unsigned int fc, unsigned int pause_time, u32 tx_cnt); + /* Set/Get Unicast MAC addresses */ + int (*set_umac_addr)(struct mac_device_info *hw, unsigned char *addr, + unsigned int reg_n, u8 *wakeup_wq); + int (*wq_set_umac_addr)(struct mac_device_info *hw, + unsigned char *addr, unsigned int reg_n, + struct dn200_vf_rxp_async_info *async_info); + void (*get_umac_addr)(struct mac_device_info *hw, unsigned char *addr, + unsigned int reg_n); + void (*set_eee_mode)(struct mac_device_info *hw, + bool en_tx_lpi_clockgating, bool en_tx_lpi_auto_timer); + void (*reset_eee_mode)(struct mac_device_info *hw); + void (*set_eee_lpi_entry_timer)(struct mac_device_info *hw, int et); + void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw); + void (*set_eee_pls)(struct mac_device_info *hw, int link); + void (*debug)(void __iomem *ioaddr, struct dn200_extra_stats *x, + u32 rx_queues, u32 tx_queues); + /* PCS calls */ + void (*pcs_ctrl_ane)(void __iomem *ioaddr, bool ane, bool srgmi_ral, + bool loopback); + void (*pcs_rane)(void __iomem *ioaddr, bool restart); + void (*pcs_get_adv_lp)(void __iomem *ioaddr, struct rgmii_adv *adv); + /* Safety Features */ + int 
(*safety_feat_config)(void __iomem *ioaddr, unsigned int asp, + struct dn200_safety_feature_cfg *safety_cfg, + struct mac_device_info *hw); + int (*safety_feat_irq_status)(struct net_device *ndev, + void __iomem *ioaddr, unsigned int asp, + struct dn200_safety_stats *stats); + int (*safety_feat_dump)(struct dn200_safety_stats *stats, int index, + unsigned long *count, const char **desc); + /* Flexible RX Parser */ + int (*rxp_config)(struct mac_device_info *hw, + struct dn200_tc_entry *entries, unsigned int count); + /* Flexible PPS */ + int (*flex_pps_config)(void __iomem *ioaddr, int index, + struct dn200_pps_cfg *cfg, bool enable, + u32 sub_second_inc, u32 systime_flags); + /* Loopback for selftests */ + int (*set_mac_loopback)(void __iomem *ioaddr, bool enable); + /* RSS */ + int (*rss_configure)(struct mac_device_info *hw, + struct dn200_rss *cfg, u32 num_rxq); + /* VLAN */ + void (*update_vlan_hash)(struct mac_device_info *hw, u32 hash, + __le16 perfect_match, bool is_double); + void (*enable_vlan)(struct mac_device_info *hw, u32 type); + void (*init_hw_vlan_rx_fltr)(struct mac_device_info *hw); + int (*add_hw_vlan_rx_fltr)(struct net_device *dev, + struct mac_device_info *hw, + __be16 proto, u16 vid, uint8_t off, + bool is_last); + int (*del_hw_vlan_rx_fltr)(struct net_device *dev, + struct mac_device_info *hw, __be16 proto, + u16 vid, uint8_t off, bool is_last); + void (*config_vlan_rx_fltr)(struct mac_device_info *hw, bool enable); + void (*rx_vlan_stripping_config)(struct mac_device_info *hw, + bool enable); + void (*restore_hw_vlan_rx_fltr)(struct net_device *dev, + struct mac_device_info *hw); + /* TX Timestamp */ + int (*get_mac_tx_timestamp)(struct mac_device_info *hw, u64 *ts); + /* Source Address Insertion / Replacement */ + void (*sarc_configure)(void __iomem *ioaddr, int val); + /* Filtering */ + int (*config_l3_filter)(struct mac_device_info *hw, u32 filter_no, + bool en, bool ipv6, bool sa, bool inv, + u32 match); + int 
(*config_ntuple_filter)(struct mac_device_info *hw, u32 filter_no, + struct dn200_fdir_filter *input, bool en); + int (*config_l4_filter)(struct mac_device_info *hw, u32 filter_no, + bool en, bool udp, bool sa, bool inv, + u32 match); + void (*l3_l4_filter_config)(struct mac_device_info *hw, bool en); + void (*set_arp_offload)(struct mac_device_info *hw, bool en, u32 addr); + int (*est_configure)(void __iomem *ioaddr, struct dn200_est *cfg, + unsigned int ptp_rate); + void (*est_irq_status)(void __iomem *ioaddr, struct net_device *dev, + struct dn200_extra_stats *x, u32 txqcnt); + void (*fpe_configure)(void __iomem *ioaddr, u32 num_txq, u32 num_rxq, + bool enable); + void (*fpe_send_mpacket)(void __iomem *ioaddr, + enum dn200_mpacket_type type); + int (*fpe_irq_status)(void __iomem *ioaddr, struct net_device *dev); + void (*rx_dds_config)(struct mac_device_info *hw, bool enable); + int (*rxp_broadcast)(struct mac_device_info *hw); + void (*rxp_filter_get)(struct mac_device_info *hw, + struct seq_file *seq); + void (*rxp_clear)(struct mac_device_info *hw); + void (*vf_del_rxp)(struct mac_device_info *hw); + void (*wq_vf_del_rxp)(struct mac_device_info *hw, int offset, u8 rxq_start); + void (*clear_vf_rxp)(struct mac_device_info *hw, u8 vf_off); + void (*vf_append_rxp_bc)(struct mac_device_info *hw, u16 channel); + void (*mtl_reset)(struct mac_device_info *hw, u32 queue, u32 chan, + u8 mode); + int (*tx_queue_flush)(struct mac_device_info *hw, u32 queue); + int (*reset_rxp)(struct mac_device_info *hw); + int (*rxf_and_acl_mem_reset)(struct mac_device_info *hw); +}; + +#define dn200_core_init(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, core_init, __args) +#define dn200_mac_set(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_mac, __args) +#define dn200_mac_rx_set(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_mac_rx, __args) +#define dn200_mac_rx_get(__priv, __args...) 
\ + dn200_do_callback(__priv, mac, get_mac_rx, __args) +#define dn200_rx_ipc(__priv, __args...) \ + dn200_do_callback(__priv, mac, rx_ipc, __args) +#define dn200_rx_queue_enable(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_queue_enable, __args) +#define dn200_rx_queue_disable(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_queue_disable, __args) +#define dn200_rx_dds_config(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_dds_config, __args) +#define dn200_rx_queue_prio(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_queue_prio, __args) +#define dn200_tx_queue_prio(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, tx_queue_prio, __args) +#define dn200_rx_queue_routing(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_queue_routing, __args) +#define dn200_prog_mtl_rx_algorithms(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, prog_mtl_rx_algorithms, __args) +#define dn200_prog_mtl_tx_algorithms(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, prog_mtl_tx_algorithms, __args) +#define dn200_set_mtl_tx_queue_weight(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_mtl_tx_queue_weight, __args) +#define dn200_set_mtl_rx_queue_weight(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_mtl_rx_queue_weight, __args) +#define dn200_map_mtl_to_dma(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, map_mtl_to_dma, __args) +#define dn200_mtl_dynamic_chan_set(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, mtl_dynamic_chan_set, __args) +#define dn200_config_cbs(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, config_cbs, __args) +#define dn200_dump_mac_regs(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, dump_regs, __args) +#define dn200_host_irq_status(__priv, __args...) \ + dn200_do_callback(__priv, mac, host_irq_status, __args) +#define dn200_host_mtl_irq_status(__priv, __args...) 
\ + dn200_do_callback(__priv, mac, host_mtl_irq_status, __args) +#define dn200_set_filter(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_filter, __args) +#define dn200_wq_set_filter(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, wq_set_filter, __args) +#define dn200_flow_ctrl(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, flow_ctrl, __args) +#define dn200_set_umac_addr(__priv, __args...) \ + dn200_do_callback(__priv, mac, set_umac_addr, __args) +#define dn200_wq_set_umac_addr(__priv, __args...) \ + dn200_do_callback(__priv, mac, wq_set_umac_addr, __args) +#define dn200_get_umac_addr(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, get_umac_addr, __args) +#define dn200_set_eee_mode(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_eee_mode, __args) +#define dn200_reset_eee_mode(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, reset_eee_mode, __args) +#define dn200_set_eee_lpi_timer(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_eee_lpi_entry_timer, __args) +#define dn200_set_eee_timer(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_eee_timer, __args) +#define dn200_set_eee_pls(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_eee_pls, __args) +#define dn200_mac_debug(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, debug, __args) +#define dn200_pcs_ctrl_ane(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, pcs_ctrl_ane, __args) +#define dn200_pcs_rane(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, pcs_rane, __args) +#define dn200_pcs_get_adv_lp(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, pcs_get_adv_lp, __args) +#define dn200_safety_feat_config(__priv, __args...) \ + dn200_do_callback(__priv, mac, safety_feat_config, __args) +#define dn200_safety_feat_irq_status(__priv, __args...) \ + dn200_do_callback(__priv, mac, safety_feat_irq_status, __args) +#define dn200_safety_feat_dump(__priv, __args...) 
\ + dn200_do_callback(__priv, mac, safety_feat_dump, __args) +#define dn200_rxp_config(__priv, __args...) \ + dn200_do_callback(__priv, mac, rxp_config, __args) +#define dn200_flex_pps_config(__priv, __args...) \ + dn200_do_callback(__priv, mac, flex_pps_config, __args) +#define dn200_set_mac_loopback(__priv, __args...) \ + dn200_do_callback(__priv, mac, set_mac_loopback, __args) +#define dn200_rss_configure(__priv, __args...) \ + dn200_do_callback(__priv, mac, rss_configure, __args) +#define dn200_update_vlan_hash(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, update_vlan_hash, __args) +#define dn200_enable_vlan(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, enable_vlan, __args) +#define dn200_add_hw_vlan_rx_fltr(__priv, __args...) \ + dn200_do_callback(__priv, mac, add_hw_vlan_rx_fltr, __args) +#define dn200_del_hw_vlan_rx_fltr(__priv, __args...) \ + dn200_do_callback(__priv, mac, del_hw_vlan_rx_fltr, __args) +#define dn200_restore_hw_vlan_rx_fltr(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, restore_hw_vlan_rx_fltr, __args) +#define dn200_get_mac_tx_timestamp(__priv, __args...) \ + dn200_do_callback(__priv, mac, get_mac_tx_timestamp, __args) +#define dn200_sarc_configure(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, sarc_configure, __args) +#define dn200_config_l3_filter(__priv, __args...) \ + dn200_do_callback(__priv, mac, config_l3_filter, __args) +#define dn200_config_l4_filter(__priv, __args...) \ + dn200_do_callback(__priv, mac, config_l4_filter, __args) +#define dn200_config_ntuple_filter(__priv, __args...) \ + dn200_do_callback(__priv, mac, config_ntuple_filter, __args) +#define dn200_l3_l4_filter_config(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, l3_l4_filter_config, __args) +#define dn200_set_arp_offload(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, set_arp_offload, __args) +#define dn200_est_configure(__priv, __args...) 
\ + dn200_do_callback(__priv, mac, est_configure, __args) +#define dn200_est_irq_status(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, est_irq_status, __args) +#define dn200_fpe_configure(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, fpe_configure, __args) +#define dn200_fpe_send_mpacket(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, fpe_send_mpacket, __args) +#define dn200_fpe_irq_status(__priv, __args...) \ + dn200_do_callback(__priv, mac, fpe_irq_status, __args) +#define dn200_rxp_broadcast(__priv, __args...) \ + dn200_do_callback(__priv, mac, rxp_broadcast, __args) +#define dn200_rxp_filter_get(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rxp_filter_get, __args) +#define dn200_init_hw_vlan_rx_fltr(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, init_hw_vlan_rx_fltr, __args) +#define dn200_config_vlan_rx_fltr(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, config_vlan_rx_fltr, __args) +#define dn200_rx_vlan_stripping_config(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rx_vlan_stripping_config, __args) +#define dn200_rxp_clear(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, rxp_clear, __args) +#define dn200_vf_del_rxp(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, vf_del_rxp, __args) +#define dn200_wq_vf_del_rxp(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, wq_vf_del_rxp, __args) +#define dn200_clear_vf_rxp(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, clear_vf_rxp, __args) +#define dn200_vf_append_rxp_bc(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, vf_append_rxp_bc, __args) +#define dn200_mtl_reset(__priv, __args...) \ + dn200_do_void_callback(__priv, mac, mtl_reset, __args) +#define dn200_tx_queue_flush(__priv, __args...) \ + dn200_do_callback(__priv, mac, tx_queue_flush, __args) +#define dn200_reset_rxp(__priv, __args...) 
\ + dn200_do_callback(__priv, mac, reset_rxp, __args) +#define dn200_rxf_and_acl_mem_reset(__priv, __args...) \ + dn200_do_callback(__priv, mac, rxf_and_acl_mem_reset, __args) +struct dn200_priv; + +/* PTP and HW Timer helpers */ +struct dn200_hwtimestamp { + void (*config_hw_tstamping)(void __iomem *ioaddr, u32 data); + void (*config_sub_second_increment)(void __iomem *ioaddr, + u32 ptp_clock, int gmac4, + u32 *ssinc); + int (*init_systime)(void __iomem *ioaddr, u32 sec, u32 nsec); + int (*config_addend)(void __iomem *ioaddr, u32 addend); + int (*adjust_systime)(void __iomem *ioaddr, u32 sec, u32 nsec, + int add_sub, int gmac4); + void (*get_systime)(void __iomem *ioaddr, u64 *systime); + void (*get_ptptime)(void __iomem *ioaddr, u64 *ptp_time); + void (*timestamp_interrupt)(struct dn200_priv *priv); +}; + +#define dn200_config_hw_tstamping(__priv, __args...) \ + dn200_do_void_callback(__priv, ptp, config_hw_tstamping, __args) +#define dn200_config_sub_second_increment(__priv, __args...) \ + dn200_do_void_callback(__priv, ptp, config_sub_second_increment, __args) +#define dn200_init_systime(__priv, __args...) \ + dn200_do_callback(__priv, ptp, init_systime, __args) +#define dn200_config_addend(__priv, __args...) \ + dn200_do_callback(__priv, ptp, config_addend, __args) +#define dn200_adjust_systime(__priv, __args...) \ + dn200_do_callback(__priv, ptp, adjust_systime, __args) +#define dn200_get_systime(__priv, __args...) \ + dn200_do_void_callback(__priv, ptp, get_systime, __args) +#define dn200_get_ptptime(__priv, __args...) \ + dn200_do_void_callback(__priv, ptp, get_ptptime, __args) +#define dn200_timestamp_interrupt(__priv, __args...) 
\ + dn200_do_void_callback(__priv, ptp, timestamp_interrupt, __args) + +/* Helpers to manage the descriptors for chain and ring modes */ +struct dn200_mode_ops { + void (*init)(void *des, dma_addr_t phy_addr, unsigned int size); + unsigned int (*is_jumbo_frm)(int len, int ehn_desc); + int (*jumbo_frm)(void *priv, struct sk_buff *skb, int csum); + int (*set_16kib_bfsize)(int mtu); + void (*init_desc3)(struct dma_desc *p); + void (*refill_desc3)(void *priv, struct dma_desc *p); + void (*clean_desc3)(void *priv, struct dma_desc *p); +}; + +#define dn200_mode_init(__priv, __args...) \ + dn200_do_void_callback(__priv, mode, init, __args) +#define dn200_is_jumbo_frm(__priv, __args...) \ + dn200_do_callback(__priv, mode, is_jumbo_frm, __args) +#define dn200_jumbo_frm(__priv, __args...) \ + dn200_do_callback(__priv, mode, jumbo_frm, __args) +#define dn200_set_16kib_bfsize(__priv, __args...) \ + dn200_do_callback(__priv, mode, set_16kib_bfsize, __args) +#define dn200_init_desc3(__priv, __args...) \ + dn200_do_void_callback(__priv, mode, init_desc3, __args) +#define dn200_refill_desc3(__priv, __args...) \ + dn200_do_void_callback(__priv, mode, refill_desc3, __args) +#define dn200_clean_desc3(__priv, __args...) \ + dn200_do_void_callback(__priv, mode, clean_desc3, __args) + +struct tc_cls_u32_offload; +struct tc_cbs_qopt_offload; +struct flow_cls_offload; +struct tc_taprio_qopt_offload; +struct tc_etf_qopt_offload; + +struct dn200_tc_ops { + int (*init)(struct dn200_priv *priv); + int (*setup_cls_u32)(struct dn200_priv *priv, + struct tc_cls_u32_offload *cls); + int (*setup_cbs)(struct dn200_priv *priv, + struct tc_cbs_qopt_offload *qopt); + int (*setup_cls)(struct dn200_priv *priv, + struct flow_cls_offload *cls); + int (*setup_taprio)(struct dn200_priv *priv, + struct tc_taprio_qopt_offload *qopt); + int (*setup_etf)(struct dn200_priv *priv, + struct tc_etf_qopt_offload *qopt); +}; + +#define dn200_tc_init(__priv, __args...) 
\ + dn200_do_callback(__priv, tc, init, __args) +#define dn200_tc_setup_cls_u32(__priv, __args...) \ + dn200_do_callback(__priv, tc, setup_cls_u32, __args) +#define dn200_tc_setup_cbs(__priv, __args...) \ + dn200_do_callback(__priv, tc, setup_cbs, __args) +#define dn200_tc_setup_cls(__priv, __args...) \ + dn200_do_callback(__priv, tc, setup_cls, __args) +#define dn200_tc_setup_taprio(__priv, __args...) \ + dn200_do_callback(__priv, tc, setup_taprio, __args) +#define dn200_tc_setup_etf(__priv, __args...) \ + dn200_do_callback(__priv, tc, setup_etf, __args) + +struct dn200_counters; + +struct dn200_mmc_ops { + void (*ctrl)(void __iomem *ioaddr, unsigned int mode); + void (*intr_all_mask)(void __iomem *ioaddr); + void (*read)(void __iomem *ioaddr, struct dn200_counters *mmc); + void (*err_clear)(void __iomem *ioaddr); +}; + +#define dn200_mmc_ctrl(__priv, __args...) \ + dn200_do_void_callback(__priv, mmc, ctrl, __args) +#define dn200_mmc_intr_all_mask(__priv, __args...) \ + dn200_do_void_callback(__priv, mmc, intr_all_mask, __args) +#define dn200_mmc_read(__priv, __args...) \ + dn200_do_void_callback(__priv, mmc, read, __args) +#define dn200_mmc_err_clear(__priv, __args...) 
\ + dn200_do_void_callback(__priv, mmc, err_clear, __args) + +struct dn200_regs_off { + u32 ptp_off; + u32 mmc_off; +}; + +extern const struct dn200_ops dwmac100_ops; +extern const struct dn200_dma_ops dwmac100_dma_ops; +extern const struct dn200_ops dwmac1000_ops; +extern const struct dn200_dma_ops dwmac1000_dma_ops; +extern const struct dn200_ops dwmac4_ops; +extern const struct dn200_dma_ops dwmac4_dma_ops; +extern const struct dn200_ops dwmac410_ops; +extern const struct dn200_dma_ops dwmac410_dma_ops; +extern const struct dn200_ops dwmac510_ops; +extern const struct dn200_ops dwxgmac210_ops; +extern const struct dn200_ops dwxgmac_sriov_ops; +extern const struct dn200_ops dwxgmac_purepf_ops; +extern const struct dn200_dma_ops dwxgmac_dma_ops; + +extern const struct dn200_desc_ops dwxgmac210_desc_ops; +extern const struct dn200_mmc_ops dwmac_mmc_ops; +extern const struct dn200_mmc_ops dwxgmac_mmc_ops; + +#define GMAC_VERSION 0x00000020 /* GMAC CORE Version */ +#define GMAC4_VERSION 0x00000110 /* GMAC4+ CORE Version */ + +int dn200_hwif_init(struct dn200_priv *priv); +bool dn200_hwif_id_check(void __iomem *ioaddr); +bool dn200_dp_hwif_id_check(void __iomem *ioaddr); +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/mmc.h b/drivers/net/ethernet/dapustor/dn200/mmc.h new file mode 100644 index 000000000000..68f441755672 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/mmc.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#ifndef __MMC_H__ +#define __MMC_H__ + +/* MMC control register */ +/* When set, all counter are reset */ +#define MMC_CNTRL_COUNTER_RESET 0x1 +/* When set, do not roll over zero after reaching the max value*/ +#define MMC_CNTRL_COUNTER_STOP_ROLLOVER 0x2 +#define MMC_CNTRL_RESET_ON_READ 0x4 /* Reset after reading */ +#define MMC_CNTRL_COUNTER_FREEZER 0x8 /* Freeze counter values to the current value.*/ +#define MMC_CNTRL_PRESET 0x10 +#define MMC_XGMAC_TX_PKT_GB 0x1c +#define MMC_CNTRL_FULL_HALF_PRESET 0x20 + +#define MMC_GMAC4_OFFSET 0x700 +#define MMC_GMAC3_X_OFFSET 0x100 +#define MMC_XGMAC_OFFSET 0x800 + +#define MMC_XGMAC_RX_PKT_GB 0x100 + +struct dn200_counters { + u64 mmc_tx_octetcount_gb; + u64 mmc_tx_framecount_gb; + u64 mmc_tx_broadcastframe_g; + u64 mmc_tx_multicastframe_g; + u64 mmc_tx_64_octets_gb; + u64 mmc_tx_65_to_127_octets_gb; + u64 mmc_tx_128_to_255_octets_gb; + u64 mmc_tx_256_to_511_octets_gb; + u64 mmc_tx_512_to_1023_octets_gb; + u64 mmc_tx_1024_to_max_octets_gb; + u64 mmc_tx_unicast_gb; + u64 mmc_tx_multicast_gb; + u64 mmc_tx_broadcast_gb; + u64 mmc_tx_underflow_error; + u64 mmc_tx_octetcount_g; + u64 mmc_tx_framecount_g; + u64 mmc_tx_pause_frame; + u64 mmc_tx_vlan_frame_g; + u64 mmc_tx_vlan_insert; + u64 mmc_tx_lpi_usec; + u64 mmc_tx_lpi_tran; + + /* MMC RX counter registers */ + u64 mmc_rx_framecount_gb; + u64 mmc_rx_octetcount_gb; + u64 mmc_rx_octetcount_g; + u64 mmc_rx_broadcastframe_g; + u64 mmc_rx_multicastframe_g; + u64 mmc_rx_crc_error; + u64 mmc_rx_align_error; + u32 mmc_rx_run_error; + u32 mmc_rx_jabber_error; + u32 mmc_rx_undersize_g; + u32 mmc_rx_oversize_g; + u64 mmc_rx_64_octets_gb; + u64 mmc_rx_65_to_127_octets_gb; + u64 mmc_rx_128_to_255_octets_gb; + u64 mmc_rx_256_to_511_octets_gb; + u64 mmc_rx_512_to_1023_octets_gb; + u64 mmc_rx_1024_to_max_octets_gb; + u64 mmc_rx_unicast_g; + u64 mmc_rx_length_error; + u64 mmc_rx_outofrangetype; + u64 mmc_rx_pause_frames; + u64 mmc_rx_fifo_overflow; + u64 mmc_rx_vlan_frames_gb; + u64 
mmc_rx_vlan_strip; + u64 mmc_rx_fd_drop; + u64 mmc_rx_watchdog_error; + unsigned int mmc_rx_lpi_usec; + unsigned int mmc_rx_lpi_tran; + u64 mmc_rx_discard_pkt_gb; + u64 mmc_rx_discard_oct_gb; + unsigned int mmc_rx_align_err; + + /* IPC */ + unsigned int mmc_rx_ipc_intr_mask; + unsigned int mmc_rx_ipc_intr; + /* IPv4 */ + u64 mmc_rx_ipv4_gd; + u64 mmc_rx_ipv4_hderr; + u64 mmc_rx_ipv4_nopay; + u64 mmc_rx_ipv4_frag; + u64 mmc_rx_ipv4_udsbl; + + u64 mmc_rx_ipv4_gd_octets; + u64 mmc_rx_ipv4_hderr_octets; + u64 mmc_rx_ipv4_nopay_octets; + u64 mmc_rx_ipv4_frag_octets; + u64 mmc_rx_ipv4_udsbl_octets; + + /* IPV6 */ + u64 mmc_rx_ipv6_gd_octets; + u64 mmc_rx_ipv6_hderr_octets; + u64 mmc_rx_ipv6_nopay_octets; + + u64 mmc_rx_ipv6_gd; + u64 mmc_rx_ipv6_hderr; + u64 mmc_rx_ipv6_nopay; + + /* Protocols */ + u64 mmc_rx_udp_gd; + u64 mmc_rx_udp_err; + u64 mmc_rx_tcp_gd; + u64 mmc_rx_tcp_err; + u64 mmc_rx_icmp_gd; + u64 mmc_rx_icmp_err; + + u64 mmc_rx_udp_gd_octets; + u64 mmc_rx_udp_err_octets; + u64 mmc_rx_tcp_gd_octets; + u64 mmc_rx_tcp_err_octets; + u64 mmc_rx_icmp_gd_octets; + u64 mmc_rx_icmp_err_octets; + + /* FPE */ + unsigned int mmc_tx_fpe_fragment_cntr; + unsigned int mmc_tx_hold_req_cntr; + unsigned int mmc_rx_packet_assembly_err_cntr; + unsigned int mmc_rx_packet_assembly_ok_cntr; + unsigned int mmc_rx_fpe_fragment_cntr; +}; + +struct dn200_swcounters { + u64 mmc_rx_fd_drop; + u64 mmc_tx_vlan_insert; + u64 mmc_rx_vlan_strip; + u64 rx_mem_copy; + u64 tx_mem_copy; + u64 tx_iatu_updt_cnt; + u64 tx_iatu_match_cnt; + u64 tx_iatu_find_cnt; + u64 tx_iatu_recyc_cnt; + u64 hw_lock_fail_cnt; + u64 hw_lock_timeout; + u32 hw_lock_recfgs; +}; + +void dwxgmac_read_mmc_reg(void __iomem *addr, u32 reg, u64 *dest); + +#endif diff --git a/drivers/net/ethernet/dapustor/dn200/mmc_core.c b/drivers/net/ethernet/dapustor/dn200/mmc_core.c new file mode 100644 index 000000000000..f488e4724d01 --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/mmc_core.c @@ -0,0 +1,378 @@ +// 
SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. + */ + +#include +#include +#include "hwif.h" +#include "mmc.h" +/* MAC Management Counters register offset */ + +#define MMC_CNTRL 0x00 /* MMC Control */ +#define MMC_RX_INTR 0x04 /* MMC RX Interrupt */ +#define MMC_TX_INTR 0x08 /* MMC TX Interrupt */ +#define MMC_RX_INTR_MASK 0x0c /* MMC Interrupt Mask */ +#define MMC_TX_INTR_MASK 0x10 /* MMC Interrupt Mask */ +#define MMC_DEFAULT_MASK 0xffffffff + +/* XGMAC MMC Registers */ +#define MMC_XGMAC_TX_OCTET_GB 0x14 +#define MMC_XGMAC_TX_BROAD_PKT_G 0x24 +#define MMC_XGMAC_TX_MULTI_PKT_G 0x2c +#define MMC_XGMAC_TX_64OCT_GB 0x34 +#define MMC_XGMAC_TX_65OCT_GB 0x3c +#define MMC_XGMAC_TX_128OCT_GB 0x44 +#define MMC_XGMAC_TX_256OCT_GB 0x4c +#define MMC_XGMAC_TX_512OCT_GB 0x54 +#define MMC_XGMAC_TX_1024OCT_GB 0x5c +#define MMC_XGMAC_TX_UNI_PKT_GB 0x64 +#define MMC_XGMAC_TX_MULTI_PKT_GB 0x6c +#define MMC_XGMAC_TX_BROAD_PKT_GB 0x74 +#define MMC_XGMAC_TX_UNDER 0x7c +#define MMC_XGMAC_TX_OCTET_G 0x84 +#define MMC_XGMAC_TX_PKT_G 0x8c +#define MMC_XGMAC_TX_PAUSE 0x94 +#define MMC_XGMAC_TX_VLAN_PKT_G 0x9c +#define MMC_XGMAC_TX_LPI_USEC 0xa4 +#define MMC_XGMAC_TX_LPI_TRAN 0xa8 + +#define MMC_XGMAC_RX_OCTET_GB 0x108 +#define MMC_XGMAC_RX_OCTET_G 0x110 +#define MMC_XGMAC_RX_BROAD_PKT_G 0x118 +#define MMC_XGMAC_RX_MULTI_PKT_G 0x120 +#define MMC_XGMAC_RX_CRC_ERR 0x128 +#define MMC_XGMAC_RX_RUNT_ERR 0x130 +#define MMC_XGMAC_RX_JABBER_ERR 0x134 +#define MMC_XGMAC_RX_UNDERSIZE 0x138 +#define MMC_XGMAC_RX_OVERSIZE 0x13c +#define MMC_XGMAC_RX_64OCT_GB 0x140 +#define MMC_XGMAC_RX_65OCT_GB 0x148 +#define MMC_XGMAC_RX_128OCT_GB 0x150 +#define MMC_XGMAC_RX_256OCT_GB 0x158 +#define MMC_XGMAC_RX_512OCT_GB 0x160 +#define MMC_XGMAC_RX_1024OCT_GB 0x168 +#define MMC_XGMAC_RX_UNI_PKT_G 0x170 +#define MMC_XGMAC_RX_LENGTH_ERR 0x178 +#define MMC_XGMAC_RX_OUTOFRANGE 0x180 +#define MMC_XGMAC_RX_PAUSE 0x188 +#define MMC_XGMAC_RX_FIFOOVER_PKT 0x190 +#define 
MMC_XGMAC_RX_VLAN_PKT_GB 0x198 +#define MMC_XGMAC_RX_WATCHDOG_ERR 0x1a0 +#define MMC_XGMAC_RX_LPI_USEC 0x1a4 +#define MMC_XGMAC_RX_LPI_TRAN 0x1a8 +#define MMC_XGMAC_RX_DISCARD_PKT_GB 0x1ac +#define MMC_XGMAC_RX_DISCARD_OCT_GB 0x1b4 +#define MMC_XGMAC_RX_ALIGN_ERR_PKT 0x1bc + +#define MMC_XGMAC_TX_FPE_FRAG 0x208 +#define MMC_XGMAC_TX_HOLD_REQ 0x20c +#define MMC_XGMAC_RX_PKT_ASSEMBLY_ERR 0x228 +#define MMC_XGMAC_RX_PKT_SMD_ERR 0x22c +#define MMC_XGMAC_RX_PKT_ASSEMBLY_OK 0x230 +#define MMC_XGMAC_RX_FPE_FRAG 0x234 +#define MMC_XGMAC_RX_IPC_INTR_MASK 0x25c + +/*mmc_core*/ +/* IPC*/ +#define MMC_XGMAC_RX_IPC_INTR 0x260 +/* IPv4*/ +#define MMC_XGMAC_RX_IPV4_GD 0x264 +#define MMC_XGMAC_RX_IPV4_HDERR 0x26C +#define MMC_XGMAC_RX_IPV4_NOPAY 0x274 +#define MMC_XGMAC_RX_IPV4_FRAG 0x27C +#define MMC_XGMAC_RX_IPV4_UDSBL 0x284 + +#define MMC_XGMAC_RX_IPV4_GD_OCTETS 0x2D4 +#define MMC_XGMAC_RX_IPV4_HDERR_OCTETS 0x2DC +#define MMC_XGMAC_RX_IPV4_NOPAY_OCTETS 0x2E4 +#define MMC_XGMAC_RX_IPV4_FRAG_OCTETS 0x2EC +#define MMC_XGMAC_RX_IPV4_UDSBL_OCTETS 0x2F4 + +/* IPV6*/ +#define MMC_XGMAC_RX_IPV6_GD_OCTETS 0x28C +#define MMC_XGMAC_RX_IPV6_HDERR_OCTETS 0x294 +#define MMC_XGMAC_RX_IPV6_NOPAY_OCTETS 0x29C + +#define MMC_XGMAC_RX_IPV6_GD 0x2FC +#define MMC_XGMAC_RX_IPV6_HDERR 0x304 +#define MMC_XGMAC_RX_IPV6_NOPAY 0x30C + +/* Protocols*/ +#define MMC_XGMAC_RX_UDP_GD 0x2A4 +#define MMC_XGMAC_RX_UDP_ERR 0x2AC +#define MMC_XGMAC_RX_TCP_GD 0x2B4 +#define MMC_XGMAC_RX_TCP_ERR 0x2BC +#define MMC_XGMAC_RX_ICMP_GD 0x2C4 +#define MMC_XGMAC_RX_ICMP_ERR 0x2CC + +#define MMC_XGMAC_RX_UDP_GD_OCTETS 0x314 +#define MMC_XGMAC_RX_UDP_ERR_OCTETS 0x31C +#define MMC_XGMAC_RX_TCP_GD_OCTETS 0x324 +#define MMC_XGMAC_RX_TCP_ERR_OCTETS 0x32C +#define MMC_XGMAC_RX_ICMP_GD_OCTETS 0x334 +#define MMC_XGMAC_RX_ICMP_ERR_OCTETS 0x33C + +static unsigned int dn200_mmc_ctrl_set(unsigned int value) +{ + value |= BIT(31); + return value; +} + +static void dwxgmac_mmc_ctrl(void __iomem *mmcaddr, unsigned int mode) +{ + u32 value 
= readl(mmcaddr + MMC_CNTRL); + + value |= (mode & 0x3F); + + value = dn200_mmc_ctrl_set(value); + + writel(value, mmcaddr + MMC_CNTRL); +} + +static void dwxgmac_mmc_intr_all_mask(void __iomem *mmcaddr) +{ + writel(0x0, mmcaddr + MMC_RX_INTR_MASK); + writel(0x0, mmcaddr + MMC_TX_INTR_MASK); + writel(MMC_DEFAULT_MASK, mmcaddr + MMC_XGMAC_RX_IPC_INTR_MASK); +} + +void dwxgmac_read_mmc_reg(void __iomem *addr, u32 reg, u64 *dest) +{ + u64 tmp = 0; + + tmp += readl(addr + reg); + tmp += ((u64) readl(addr + reg + 0x4)) << 32; + *dest = *dest + tmp; +} + +static void dn200_mmc_extra_read(void __iomem *mmcaddr, + struct dn200_counters *mmc) +{ + mmc->mmc_rx_ipc_intr_mask += + readl(mmcaddr + MMC_XGMAC_RX_IPC_INTR_MASK); + mmc->mmc_rx_ipc_intr += readl(mmcaddr + MMC_XGMAC_RX_IPC_INTR); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_GD, + &mmc->mmc_rx_ipv4_gd); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_HDERR, + &mmc->mmc_rx_ipv4_hderr); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_NOPAY, + &mmc->mmc_rx_ipv4_nopay); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_FRAG, + &mmc->mmc_rx_ipv4_frag); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_UDSBL, + &mmc->mmc_rx_ipv4_udsbl); + + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_GD_OCTETS, + &mmc->mmc_rx_ipv4_gd_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_HDERR_OCTETS, + &mmc->mmc_rx_ipv4_hderr_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_NOPAY_OCTETS, + &mmc->mmc_rx_ipv4_nopay_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_FRAG_OCTETS, + &mmc->mmc_rx_ipv4_frag_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_UDSBL_OCTETS, + &mmc->mmc_rx_ipv4_udsbl_octets); + + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_GD_OCTETS, + &mmc->mmc_rx_ipv6_gd_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_HDERR_OCTETS, + &mmc->mmc_rx_ipv6_hderr_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_NOPAY_OCTETS, + &mmc->mmc_rx_ipv6_nopay_octets); + + 
dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_GD, + &mmc->mmc_rx_ipv6_gd); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_HDERR, + &mmc->mmc_rx_ipv6_hderr); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV6_NOPAY, + &mmc->mmc_rx_ipv6_nopay); + + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_GD, &mmc->mmc_rx_udp_gd); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_ERR, + &mmc->mmc_rx_udp_err); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_GD, &mmc->mmc_rx_tcp_gd); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_ERR, + &mmc->mmc_rx_tcp_err); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_GD, + &mmc->mmc_rx_icmp_gd); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_ERR, + &mmc->mmc_rx_icmp_err); + + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_GD_OCTETS, + &mmc->mmc_rx_udp_gd_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_ERR_OCTETS, + &mmc->mmc_rx_udp_err_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_GD_OCTETS, + &mmc->mmc_rx_tcp_gd_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_ERR_OCTETS, + &mmc->mmc_rx_tcp_err_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_GD_OCTETS, + &mmc->mmc_rx_icmp_gd_octets); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_ERR_OCTETS, + &mmc->mmc_rx_icmp_err_octets); +} + +/* when link down, need save xgmac's count. + * when link up, need clear some wrong count which + * come from down->up state. 
+ */ +static void dwxgmac_mmc_err_clear(void __iomem *mmcaddr) +{ + u64 value = 0; + + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_CRC_ERR, + &value); + value += readl(mmcaddr + MMC_XGMAC_RX_RUNT_ERR); + value += readl(mmcaddr + MMC_XGMAC_RX_JABBER_ERR); + value += readl(mmcaddr + MMC_XGMAC_RX_UNDERSIZE); + value += readl(mmcaddr + MMC_XGMAC_RX_OVERSIZE); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_LENGTH_ERR, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_OUTOFRANGE, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_FIFOOVER_PKT, + &value); + value += + readl(mmcaddr + MMC_XGMAC_RX_WATCHDOG_ERR); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_DISCARD_PKT_GB, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_DISCARD_OCT_GB, + &value); + value += readl(mmcaddr + MMC_XGMAC_RX_ALIGN_ERR_PKT); + value += + readl(mmcaddr + MMC_XGMAC_RX_PKT_ASSEMBLY_ERR); + value += + readl(mmcaddr + MMC_XGMAC_RX_PKT_SMD_ERR); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_HDERR, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_IPV4_NOPAY, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_ERR, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UDP_ERR_OCTETS, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_ERR, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_TCP_ERR_OCTETS, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_ERR, + &value); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_ICMP_ERR_OCTETS, + &value); +} + +/* This reads the MAC core counters (if actually supported). + * By default the MMC core is programmed to reset each + * counter after a read. So all the fields of the mmc struct + * have to be incremented.
+ */ +static void dwxgmac_mmc_read(void __iomem *mmcaddr, struct dn200_counters *mmc) +{ + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_OCTET_GB, + &mmc->mmc_tx_octetcount_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_BROAD_PKT_G, + &mmc->mmc_tx_broadcastframe_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_MULTI_PKT_G, + &mmc->mmc_tx_multicastframe_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_64OCT_GB, + &mmc->mmc_tx_64_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_65OCT_GB, + &mmc->mmc_tx_65_to_127_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_128OCT_GB, + &mmc->mmc_tx_128_to_255_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_256OCT_GB, + &mmc->mmc_tx_256_to_511_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_512OCT_GB, + &mmc->mmc_tx_512_to_1023_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_1024OCT_GB, + &mmc->mmc_tx_1024_to_max_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_UNI_PKT_GB, + &mmc->mmc_tx_unicast_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_MULTI_PKT_GB, + &mmc->mmc_tx_multicast_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_BROAD_PKT_GB, + &mmc->mmc_tx_broadcast_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_UNDER, + &mmc->mmc_tx_underflow_error); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_OCTET_G, + &mmc->mmc_tx_octetcount_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_PKT_G, + &mmc->mmc_tx_framecount_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_PAUSE, + &mmc->mmc_tx_pause_frame); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_VLAN_PKT_G, + &mmc->mmc_tx_vlan_frame_g); + mmc->mmc_tx_lpi_usec += readl(mmcaddr + MMC_XGMAC_TX_LPI_USEC); + mmc->mmc_tx_lpi_tran += readl(mmcaddr + MMC_XGMAC_TX_LPI_TRAN); /* was TX_LPI_USEC: copy-paste bug */ + + /* MMC RX counter registers */ + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_OCTET_GB, + &mmc->mmc_rx_octetcount_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_OCTET_G, + &mmc->mmc_rx_octetcount_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_BROAD_PKT_G, +
&mmc->mmc_rx_broadcastframe_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_MULTI_PKT_G, + &mmc->mmc_rx_multicastframe_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_CRC_ERR, + &mmc->mmc_rx_crc_error); + mmc->mmc_rx_run_error += readl(mmcaddr + MMC_XGMAC_RX_RUNT_ERR); + mmc->mmc_rx_jabber_error += readl(mmcaddr + MMC_XGMAC_RX_JABBER_ERR); + mmc->mmc_rx_undersize_g += readl(mmcaddr + MMC_XGMAC_RX_UNDERSIZE); + mmc->mmc_rx_oversize_g += readl(mmcaddr + MMC_XGMAC_RX_OVERSIZE); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_64OCT_GB, + &mmc->mmc_rx_64_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_65OCT_GB, + &mmc->mmc_rx_65_to_127_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_128OCT_GB, + &mmc->mmc_rx_128_to_255_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_256OCT_GB, + &mmc->mmc_rx_256_to_511_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_512OCT_GB, + &mmc->mmc_rx_512_to_1023_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_1024OCT_GB, + &mmc->mmc_rx_1024_to_max_octets_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_UNI_PKT_G, + &mmc->mmc_rx_unicast_g); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_LENGTH_ERR, + &mmc->mmc_rx_length_error); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_OUTOFRANGE, + &mmc->mmc_rx_outofrangetype); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_PAUSE, + &mmc->mmc_rx_pause_frames); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_FIFOOVER_PKT, + &mmc->mmc_rx_fifo_overflow); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_VLAN_PKT_GB, + &mmc->mmc_rx_vlan_frames_gb); + mmc->mmc_rx_watchdog_error += + readl(mmcaddr + MMC_XGMAC_RX_WATCHDOG_ERR); + mmc->mmc_rx_lpi_usec += readl(mmcaddr + MMC_XGMAC_RX_LPI_USEC); + mmc->mmc_rx_lpi_tran += readl(mmcaddr + MMC_XGMAC_RX_LPI_TRAN); /* was RX_LPI_USEC: copy-paste bug */ + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_DISCARD_PKT_GB, + &mmc->mmc_rx_discard_pkt_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_DISCARD_OCT_GB, + &mmc->mmc_rx_discard_oct_gb); + mmc->mmc_rx_align_err += readl(mmcaddr +
MMC_XGMAC_RX_ALIGN_ERR_PKT); + + mmc->mmc_tx_fpe_fragment_cntr += readl(mmcaddr + MMC_XGMAC_TX_FPE_FRAG); + mmc->mmc_tx_hold_req_cntr += readl(mmcaddr + MMC_XGMAC_TX_HOLD_REQ); + mmc->mmc_rx_packet_assembly_err_cntr += + readl(mmcaddr + MMC_XGMAC_RX_PKT_ASSEMBLY_ERR); + mmc->mmc_rx_packet_assembly_ok_cntr += + readl(mmcaddr + MMC_XGMAC_RX_PKT_ASSEMBLY_OK); + mmc->mmc_rx_fpe_fragment_cntr += readl(mmcaddr + MMC_XGMAC_RX_FPE_FRAG); + + dn200_mmc_extra_read(mmcaddr, mmc); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_TX_PKT_GB, + &mmc->mmc_tx_framecount_gb); + dwxgmac_read_mmc_reg(mmcaddr, MMC_XGMAC_RX_PKT_GB, + &mmc->mmc_rx_framecount_gb); +} + +const struct dn200_mmc_ops dwxgmac_mmc_ops = { + .ctrl = dwxgmac_mmc_ctrl, + .intr_all_mask = dwxgmac_mmc_intr_all_mask, + .read = dwxgmac_mmc_read, + .err_clear = dwxgmac_mmc_err_clear, +}; diff --git a/drivers/net/ethernet/dapustor/dn200/ring_mode.c b/drivers/net/ethernet/dapustor/dn200/ring_mode.c new file mode 100644 index 000000000000..86a8c6b3af6e --- /dev/null +++ b/drivers/net/ethernet/dapustor/dn200/ring_mode.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2024, DapuStor Corporation. 
+ */ + +#include "dn200.h" + +static int jumbo_frm(void *p, struct sk_buff *skb, int csum) +{ + struct dn200_tx_queue *tx_q = (struct dn200_tx_queue *)p; + unsigned int nopaged_len = skb_headlen(skb); + struct dn200_priv *priv = tx_q->priv_data; + unsigned int entry = tx_q->cur_tx; + unsigned int bmax, len, des2; + struct dma_desc *desc; + + desc = tx_q->dma_tx + entry; + + if (priv->plat->enh_desc) + bmax = BUF_SIZE_8KiB; + else + bmax = BUF_SIZE_2KiB; + + len = nopaged_len - bmax; + + if (nopaged_len > BUF_SIZE_8KiB) { + + des2 = dma_map_single(priv->device, skb->data, bmax, + DMA_TO_DEVICE); + desc->des2 = cpu_to_le32(des2); + if (dma_mapping_error(priv->device, des2)) + return -1; + + tx_q->tx_skbuff_dma[entry].buf = des2; + tx_q->tx_skbuff_dma[entry].len = bmax; + tx_q->tx_skbuff_dma[entry].is_jumbo = true; + + desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB); + dn200_prepare_tx_desc(priv, desc, 1, bmax, csum, + DN200_RING_MODE, 0, false, skb->len); + tx_q->tx_skbuff[entry] = NULL; + entry = DN200_GET_ENTRY(entry, priv->dma_tx_size); + + desc = tx_q->dma_tx + entry; + + des2 = dma_map_single(priv->device, skb->data + bmax, len, + DMA_TO_DEVICE); + desc->des2 = cpu_to_le32(des2); + if (dma_mapping_error(priv->device, des2)) + return -1; + tx_q->tx_skbuff_dma[entry].buf = des2; + tx_q->tx_skbuff_dma[entry].len = len; + tx_q->tx_skbuff_dma[entry].is_jumbo = true; + + desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB); + dn200_prepare_tx_desc(priv, desc, 0, len, csum, + DN200_RING_MODE, 1, + !skb_is_nonlinear(skb), skb->len); + } else { + des2 = dma_map_single(priv->device, skb->data, + nopaged_len, DMA_TO_DEVICE); + desc->des2 = cpu_to_le32(des2); + if (dma_mapping_error(priv->device, des2)) + return -1; + tx_q->tx_skbuff_dma[entry].buf = des2; + tx_q->tx_skbuff_dma[entry].len = nopaged_len; + tx_q->tx_skbuff_dma[entry].is_jumbo = true; + desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB); + dn200_prepare_tx_desc(priv, desc, 1, nopaged_len, csum, + DN200_RING_MODE, 0, + 
!skb_is_nonlinear(skb), skb->len); + } + + tx_q->cur_tx = entry; + + return entry; +} + +static unsigned int is_jumbo_frm(int len, int enh_desc) +{ + unsigned int ret = 0; + + if (len >= BUF_SIZE_4KiB) + ret = 1; + + return ret; +} + +static void refill_desc3(void *priv_ptr, struct dma_desc *p) +{ + struct dn200_rx_queue *rx_q = priv_ptr; + struct dn200_priv *priv = rx_q->priv_data; + + /* Fill DES3 in case of RING mode */ + if (priv->dma_buf_sz == BUF_SIZE_16KiB) + p->des3 = cpu_to_le32(le32_to_cpu(p->des2) + BUF_SIZE_8KiB); +} + +/* In ring mode we need to fill the desc3 because it is used as buffer */ +static void init_desc3(struct dma_desc *p) +{ + p->des3 = cpu_to_le32(le32_to_cpu(p->des2) + BUF_SIZE_8KiB); +} + +static void clean_desc3(void *priv_ptr, struct dma_desc *p) +{ + struct dn200_tx_queue *tx_q = (struct dn200_tx_queue *)priv_ptr; + struct dn200_priv *priv = tx_q->priv_data; + unsigned int entry = tx_q->dirty_tx; + + /* des3 is only used for jumbo frames tx or time stamping */ + if (unlikely(tx_q->tx_skbuff_dma[entry].is_jumbo || + (tx_q->tx_skbuff_dma[entry].last_segment + && priv->hwts_tx_en))) + p->des3 = 0; +} + +static int set_16kib_bfsize(int mtu) +{ + int ret = 0; + + if (unlikely(mtu > BUF_SIZE_8KiB)) + ret = BUF_SIZE_16KiB; + return ret; +} + +const struct dn200_mode_ops ring_mode_ops = { + .is_jumbo_frm = is_jumbo_frm, + .jumbo_frm = jumbo_frm, + .refill_desc3 = refill_desc3, + .init_desc3 = init_desc3, + .clean_desc3 = clean_desc3, + .set_16kib_bfsize = set_16kib_bfsize, +}; -- Gitee From 97bc5c87a4ba9bf36cf86407b7d13ba8c7e1b335 Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:22 +0800 Subject: [PATCH 02/59] arm64/sysreg: add HDBSS related register information commit 16ab2a2fee8bf10931c8821c56f724a06d0f1e68 openEuler The ARM architecture added the HDBSS feature and descriptions of related registers (HDBSSBR/HDBSSPROD) in the DDI0601(ID121123) version, add them to Linux. 
Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/esr.h | 2 ++ arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/tools/sysreg | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index b04575ea3a35..ca9a13c9a668 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -144,6 +144,8 @@ #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS2 field definitions for Data Aborts */ +#define ESR_ELx_HDBSSF_SHIFT (11) +#define ESR_ELx_HDBSSF (UL(1) << ESR_ELx_HDBSSF_SHIFT) #define ESR_ELx_TnD_SHIFT (10) #define ESR_ELx_TnD (UL(1) << ESR_ELx_TnD_SHIFT) #define ESR_ELx_TagAccess_SHIFT (9) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 01613c85685e..fd598c539ba5 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -128,6 +128,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ +#define VTCR_EL2_HDBSS (1UL << 45) #define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index baded26ab7d7..963177485fd8 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2477,6 +2477,34 @@ Sysreg SMCR_EL2 3 4 1 2 6 Fields SMCR_ELx EndSysreg +Sysreg HDBSSBR_EL2 3 4 2 3 2 +Res0 63:56 +Field 55:12 BADDR +Res0 11:4 +Enum 3:0 SZ + 0b0001 8KB + 0b0010 16KB + 0b0011 32KB + 0b0100 64KB + 0b0101 128KB + 0b0110 256KB + 0b0111 512KB + 0b1000 1MB + 0b1001 2MB +EndEnum +EndSysreg + +Sysreg HDBSSPROD_EL2 3 4 2 3 3 +Res0 63:32 +Enum 31:26 FSC + 0b000000 OK + 0b010000 ExternalAbort + 0b101000 GPF +EndEnum +Res0 25:19 +Field 18:0 INDEX +EndSysreg + Sysreg DACR32_EL2 3 4 3 0 0 Res0 63:32 Field 31:30 D15 -- Gitee From 0c05938e16a7b9107696dcad1cb8501515f1a2e0 Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:23 
+0800 Subject: [PATCH 03/59] arm64/kvm: support set the DBM attr during memory abort commit 74a1397438e01e10764816526b621a965d5a9cf7 openEuler Since the ARMv8, the page entry has supported the DBM attribute. Support set the attr during user_mem_abort(). Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_pgtable.h | 1 + arch/arm64/kvm/hyp/pgtable.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4153b77d8aae..0c0ae56e8163 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -189,6 +189,7 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_DBM = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4e3e497784fb..08bd10447103 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -44,6 +44,8 @@ #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_DBM BIT(51) + #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ @@ -711,6 +713,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + if (prot & KVM_PGTABLE_PROT_DBM) + attr |= KVM_PTE_LEAF_ATTR_HI_S2_DBM; + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; @@ -1315,6 +1320,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + if (prot & KVM_PGTABLE_PROT_DBM) + set |= KVM_PTE_LEAF_ATTR_HI_S2_DBM; + if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; -- Gitee From 
e10171c32cc3e173b41fbfb95d62182cd52a76da Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:24 +0800 Subject: [PATCH 04/59] arm64/kvm: using ioctl to enable/disable the HDBSS feature commit bd6106e348d70cafe3f030ba6f60fb010f09b82e openEuler In ARM64, the buffer size corresponding to the HDBSS feature is configurable. Therefore, we cannot enable the HDBSS feature during KVM initialization, but we should enable it when triggering a live migration, where the buffer size can be configured by the user. The KVM_CAP_ARM_HW_DIRTY_STATE_TRACK ioctl is added to enable/disable this feature. Users (such as qemu) can invoke the ioctl to enable HDBSS at the beginning of the migration and disable the feature by invoking the ioctl again at the end of the migration with size set to 0. Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/cpufeature.h | 12 +++++ arch/arm64/include/asm/kvm_host.h | 6 +++ arch/arm64/include/asm/kvm_mmu.h | 12 +++++ arch/arm64/include/asm/sysreg.h | 12 +++++ arch/arm64/kvm/arm.c | 70 +++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/vhe/switch.c | 1 + arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2 + arch/arm64/kvm/mmu.c | 3 ++ arch/arm64/kvm/reset.c | 9 +++- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 2 + tools/include/uapi/linux/kvm.h | 1 + 12 files changed, 130 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 87752218051c..053dfd58abf9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -753,6 +753,18 @@ static __always_inline bool system_supports_fpsimd(void) return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); } +static inline bool system_supports_hdbss(void) +{ + u64 mmfr1; + u32 val; + + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + val = cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_HAFDBS_SHIFT); + + return val == ID_AA64MMFR1_EL1_HAFDBS_HDBSS; +} 
+ static inline bool system_uses_hw_pan(void) { return IS_ENABLED(CONFIG_ARM64_PAN) && diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d78c3fe357cf..49dfd36f0632 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -613,6 +613,12 @@ struct kvm_vcpu_arch { /* Realm meta data */ struct realm_rec rec; + + /* HDBSS registers info */ + struct { + u64 br_el2; + u64 prod_el2; + } hdbss; }; /* diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a425ecdd7be0..5d90a332f2d7 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -335,6 +335,18 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } +static __always_inline void __load_hdbss(struct kvm_vcpu *vcpu) +{ + if (!vcpu->kvm->enable_hdbss) + return; + + write_sysreg_s(vcpu->arch.hdbss.br_el2, SYS_HDBSSBR_EL2); + write_sysreg_s(vcpu->arch.hdbss.prod_el2, SYS_HDBSSPROD_EL2); + + dsb(sy); + isb(); +} + static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6727dbf17478..830862ebfd31 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1028,6 +1028,18 @@ #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +/* + * Definitions for the HDBSS feature + */ +#define HDBSS_MAX_SIZE HDBSSBR_EL2_SZ_2MB + +#define HDBSSBR_EL2(baddr, sz) (((baddr) & GENMASK(55, 12 + sz)) | \ + ((sz) << HDBSSBR_EL2_SZ_SHIFT)) +#define HDBSSBR_BADDR(br) ((br) & GENMASK(55, (12 + HDBSSBR_SZ(br)))) +#define HDBSSBR_SZ(br) (((br) & HDBSSBR_EL2_SZ_MASK) >> HDBSSBR_EL2_SZ_SHIFT) + +#define HDBSSPROD_IDX(prod) (((prod) & HDBSSPROD_EL2_INDEX_MASK) >> HDBSSPROD_EL2_INDEX_SHIFT) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do 
not add new users. */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7ff3d49827b4..c41fd1d57f53 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -80,6 +80,70 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } +static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + struct page *hdbss_pg; + int size = cap->args[0]; + + if (!system_supports_hdbss()) { + kvm_err("This system does not support HDBSS!\n"); + return -EINVAL; + } + + if (size < 0 || size > HDBSS_MAX_SIZE) { + kvm_err("Invalid HDBSS buffer size: %d!\n", size); + return -EINVAL; + } + + /* Enable the HDBSS feature if size > 0, otherwise disable it. */ + if (size) { + kvm->enable_hdbss = true; + kvm->arch.vtcr |= VTCR_EL2_HD | VTCR_EL2_HDBSS; + + kvm_for_each_vcpu(i, vcpu, kvm) { + hdbss_pg = alloc_pages(GFP_KERNEL, size); + if (!hdbss_pg) { + kvm_err("Alloc HDBSS buffer failed!\n"); + return -EINVAL; + } + + vcpu->arch.hdbss.br_el2 = HDBSSBR_EL2(page_to_phys(hdbss_pg), size); + vcpu->arch.hdbss.prod_el2 = 0; + + /* + * We should kick vcpus out of guest mode here to + * load new vtcr value to vtcr_el2 register when + * re-enter guest mode. + */ + kvm_vcpu_kick(vcpu); + } + + kvm_info("Enable HDBSS success, HDBSS buffer size: %d\n", size); + } else if (kvm->enable_hdbss) { + kvm->arch.vtcr &= ~(VTCR_EL2_HD | VTCR_EL2_HDBSS); + + kvm_for_each_vcpu(i, vcpu, kvm) { + /* Kick vcpus to flush hdbss buffer. 
*/ + kvm_vcpu_kick(vcpu); + + hdbss_pg = phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2)); + if (hdbss_pg) + __free_pages(hdbss_pg, HDBSSBR_SZ(vcpu->arch.hdbss.br_el2)); + + vcpu->arch.hdbss.br_el2 = 0; + vcpu->arch.hdbss.prod_el2 = 0; + } + + kvm->enable_hdbss = false; + kvm_info("Disable HDBSS success\n"); + } + + return 0; +} + int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -132,6 +196,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm_realm_enable_cap(kvm, cap); mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: + r = kvm_cap_arm_enable_hdbss(kvm, cap); + break; default: r = -EINVAL; break; @@ -367,6 +434,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_RME: r = static_key_enabled(&kvm_rme_is_available); break; + case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: + r = system_supports_hdbss(); + break; default: r = 0; } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index a38bafc27d65..edf9eb81dcdb 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -226,6 +226,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * __activate_traps clear HCR_EL2.TGE (among other things). 
*/ __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); + __load_hdbss(vcpu); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 5cb4b70e0aef..236d07c1b0b8 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -92,6 +92,8 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_restore_el1_state(guest_ctxt); __mpam_guest_load(); + __load_hdbss(vcpu); + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); activate_traps_vhe_load(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 512a6b743d04..70969604cf4e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1660,6 +1660,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; + if (kvm->enable_hdbss && logging_active) + prot |= KVM_PGTABLE_PROT_DBM; + if (exec_fault) prot |= KVM_PGTABLE_PROT_X; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 38282c3b5236..c98f6bc5fcc5 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -133,7 +133,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) kfree(buf); return ret; } - + vcpu->arch.sve_state = buf; vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; @@ -176,6 +176,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { void *sve_state = vcpu->arch.sve_state; + struct page *hdbss_pg; kvm_vcpu_unshare_task_fp(vcpu); kvm_unshare_hyp(vcpu, vcpu + 1); @@ -184,6 +185,12 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(sve_state); kfree(vcpu->arch.ccsidr); kvm_destroy_rec(vcpu); + + if (vcpu->arch.hdbss.br_el2) { + hdbss_pg = phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2)); + if (hdbss_pg) + __free_pages(hdbss_pg, HDBSSBR_SZ(vcpu->arch.hdbss.br_el2)); + } } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h 
b/include/linux/kvm_host.h index a9b650e757ab..a4b41a0215d1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -855,6 +855,7 @@ struct kvm { struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; + bool enable_hdbss; }; #define kvm_err(fmt, ...) \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2da470d453df..b4206f7b66f2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1250,6 +1250,8 @@ struct kvm_ppc_resize_hpt { /* support userspace to request management of CSV3 shared pages */ #define KVM_CAP_HYGON_COCO_EXT_CSV3_SP_MGR (1 << 4) +#define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 + #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing_irqchip { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index e6184731588e..44fca9d98b7d 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1248,6 +1248,7 @@ struct kvm_ppc_resize_hpt { #ifdef KVM_CAP_IRQ_ROUTING +#define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 struct kvm_irq_routing_irqchip { __u32 irqchip; __u32 pin; -- Gitee From 457ec3e4931f809b5aa833b357e6f855af70d68e Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:25 +0800 Subject: [PATCH 05/59] arm64/kvm: support to handle the HDBSSF event commit 6d60015590c3f56b0188b2409cfb3ad9e05ded00 openEuler Updating the dirty bitmap based on the HDBSS buffer. Similar to the implementation of the x86 pml feature, KVM flushes the buffers on all VM-Exits, thus we only need to kick running vCPUs to force a VM-Exit. 
Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/arm.c | 10 ++++++++ arch/arm64/kvm/handle_exit.c | 47 ++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/mmu.c | 10 +++++++- 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c41fd1d57f53..5eab25fd328b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1741,7 +1741,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { + /* + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called + * before reporting dirty_bitmap to userspace. KVM flushes the buffers + * on all VM-Exits, thus we only need to kick running vCPUs to force a + * VM-Exit. + */ + struct kvm_vcpu *vcpu; + unsigned long i; + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 617ae6dea5d5..f1d6456126be 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -288,6 +288,50 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) return arm_exit_handlers[esr_ec]; } +#define HDBSS_ENTRY_VALID_SHIFT 0 +#define HDBSS_ENTRY_VALID_MASK (1UL << HDBSS_ENTRY_VALID_SHIFT) +#define HDBSS_ENTRY_IPA_SHIFT 12 +#define HDBSS_ENTRY_IPA_MASK GENMASK_ULL(55, HDBSS_ENTRY_IPA_SHIFT) + +static void kvm_flush_hdbss_buffer(struct kvm_vcpu *vcpu) +{ + int idx, curr_idx; + u64 *hdbss_buf; + + if (!vcpu->kvm->enable_hdbss) + return; + + dsb(sy); + isb(); + curr_idx = HDBSSPROD_IDX(read_sysreg_s(SYS_HDBSSPROD_EL2)); + + /* Do nothing if HDBSS buffer is empty or br_el2 is NULL */ + if (curr_idx == 0 || vcpu->arch.hdbss.br_el2 == 0) + return; + + hdbss_buf = page_address(phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2))); + if (!hdbss_buf) { + kvm_err("Enter flush hdbss buffer with buffer == NULL!"); + return; + } 
+ + for (idx = 0; idx < curr_idx; idx++) { + u64 gpa; + + gpa = hdbss_buf[idx]; + if (!(gpa & HDBSS_ENTRY_VALID_MASK)) + continue; + + gpa = gpa & HDBSS_ENTRY_IPA_MASK; + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); + } + + /* reset HDBSS index */ + write_sysreg_s(0, SYS_HDBSSPROD_EL2); + dsb(sy); + isb(); +} + /* * We may be single-stepping an emulated instruction. If the emulation * has been completed in the kernel, we can return to userspace with a @@ -323,6 +367,9 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; + if (vcpu->kvm->enable_hdbss) + kvm_flush_hdbss_buffer(vcpu); + if (ARM_SERROR_PENDING(exception_index)) { /* * The SError is handled by handle_exit_early(). If the guest diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 70969604cf4e..38eb97d1ad1f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1733,7 +1733,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) unsigned long fault_status; phys_addr_t fault_ipa; struct kvm_memory_slot *memslot; - unsigned long hva; + unsigned long hva, iss2; bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; @@ -1743,6 +1743,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + /* + * HDBSS buffer already flushed when enter handle_trap_exceptions(). + * Nothing to do here. 
+ */ + iss2 = ESR_ELx_ISS2(kvm_vcpu_get_esr(vcpu)); + if (fault_status == ESR_ELx_FSC_PERM && (iss2 & ESR_ELx_HDBSSF)) + return 1; + if (fault_status == ESR_ELx_FSC_FAULT) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { -- Gitee From bf2edabddce8aadcb2aa1282fda32c4ae8ce18a6 Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:26 +0800 Subject: [PATCH 06/59] arm64/config: add config to control whether enable HDBSS feature commit 8d2e1e3b26a99872309a1c2afa3f0f35f1102137 openEuler The HDBSS feature introduces new assembly registers (HDBSSBR_EL2 and HDBSSPROD_EL2), which depends on the armv9.5-a compilation support. So add ARM64_HDBSS config to control whether enable the HDBSS feature. Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/configs/tencent.config | 2 ++ arch/arm64/include/asm/cpufeature.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_mmu.h | 2 ++ arch/arm64/include/asm/sysreg.h | 2 ++ arch/arm64/kvm/arm.c | 8 ++++++++ arch/arm64/kvm/handle_exit.c | 5 ++++- arch/arm64/kvm/hyp/pgtable.c | 4 ++++ arch/arm64/kvm/hyp/vhe/switch.c | 2 ++ arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2 ++ arch/arm64/kvm/mmu.c | 4 ++++ arch/arm64/kvm/reset.c | 2 ++ include/linux/kvm_host.h | 2 ++ 14 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2843b6be308b..fffca0ee3d89 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2306,6 +2306,17 @@ config ARM64_HAFT support this feature. If unsure, say Y. 
endmenu # "ARMv8.8 architectural features" +menu "ARMv9.5 architectural features" + +config ARM64_HDBSS + bool "Enable support for Hardware Dirty state tracking Structure (HDBSS)" + default y + help + Hardware Dirty state tracking Structure(HDBSS) enhances tracking + translation table descriptors’ dirty state to reduce the cost of + surveying for dirtied granules. + +endmenu # "ARMv9.5 architectural features" config ARM64_SVE bool "ARM Scalable Vector Extension support" diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index 3c8c2e0f024f..8f5faca80659 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -115,6 +115,7 @@ CONFIG_ACPI_APEI_EINJ=m CONFIG_ACPI_APEI_ERST_DEBUG=m CONFIG_ACPI_PFRUT=m CONFIG_ACPI_AGDI=y +CONFIG_KVM_HISI_VIRT=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=y CONFIG_LIVEPATCH=y @@ -645,6 +646,7 @@ CONFIG_NVME_TARGET_FC=m CONFIG_NVME_TARGET_FCLOOP=m CONFIG_NVME_TARGET_TCP=m CONFIG_ENCLOSURE_SERVICES=m +CONFIG_VIRT_PLAT_DEV=y CONFIG_EEPROM_AT24=m CONFIG_EEPROM_93CX6=y CONFIG_EEPROM_EE1004=m diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 053dfd58abf9..5801456f7f1e 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -753,6 +753,7 @@ static __always_inline bool system_supports_fpsimd(void) return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); } +#ifdef CONFIG_ARM64_HDBSS static inline bool system_supports_hdbss(void) { u64 mmfr1; @@ -764,6 +765,7 @@ static inline bool system_supports_hdbss(void) return val == ID_AA64MMFR1_EL1_HAFDBS_HDBSS; } +#endif static inline bool system_uses_hw_pan(void) { diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 49dfd36f0632..aff5383a3958 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -614,11 +614,13 @@ struct kvm_vcpu_arch { /* Realm meta data */ struct realm_rec rec; +#ifdef CONFIG_ARM64_HDBSS 
/* HDBSS registers info */ struct { u64 br_el2; u64 prod_el2; } hdbss; +#endif }; /* diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 5d90a332f2d7..5918769294ed 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -335,6 +335,7 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } +#ifdef CONFIG_ARM64_HDBSS static __always_inline void __load_hdbss(struct kvm_vcpu *vcpu) { if (!vcpu->kvm->enable_hdbss) @@ -346,6 +347,7 @@ static __always_inline void __load_hdbss(struct kvm_vcpu *vcpu) dsb(sy); isb(); } +#endif static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 830862ebfd31..783b448e3e3c 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1028,6 +1028,7 @@ #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +#ifdef CONFIG_ARM64_HDBSS /* * Definitions for the HDBSS feature */ @@ -1039,6 +1040,7 @@ #define HDBSSBR_SZ(br) (((br) & HDBSSBR_EL2_SZ_MASK) >> HDBSSBR_EL2_SZ_SHIFT) #define HDBSSPROD_IDX(prod) (((prod) & HDBSSPROD_EL2_INDEX_MASK) >> HDBSSPROD_EL2_INDEX_SHIFT) +#endif #define ARM64_FEATURE_FIELD_BITS 4 diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5eab25fd328b..3579cbeb9874 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -80,6 +80,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } +#ifdef CONFIG_ARM64_HDBSS static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -143,6 +144,7 @@ static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, return 0; } +#endif int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) @@ -196,9 +198,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm_realm_enable_cap(kvm, cap); 
mutex_unlock(&kvm->lock); break; +#ifdef CONFIG_ARM64_HDBSS case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: r = kvm_cap_arm_enable_hdbss(kvm, cap); break; +#endif default: r = -EINVAL; break; @@ -434,9 +438,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_RME: r = static_key_enabled(&kvm_rme_is_available); break; +#ifdef CONFIG_ARM64_HDBSS case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: r = system_supports_hdbss(); break; +#endif default: r = 0; } @@ -1741,6 +1747,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { +#ifdef CONFIG_ARM64_HDBSS /* * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called * before reporting dirty_bitmap to userspace. KVM flushes the buffers @@ -1752,6 +1759,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); +#endif } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f1d6456126be..770d4622323e 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -288,6 +288,7 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) return arm_exit_handlers[esr_ec]; } +#ifdef CONFIG_ARM64_HDBSS #define HDBSS_ENTRY_VALID_SHIFT 0 #define HDBSS_ENTRY_VALID_MASK (1UL << HDBSS_ENTRY_VALID_SHIFT) #define HDBSS_ENTRY_IPA_SHIFT 12 @@ -331,6 +332,7 @@ static void kvm_flush_hdbss_buffer(struct kvm_vcpu *vcpu) dsb(sy); isb(); } +#endif /* * We may be single-stepping an emulated instruction. If the emulation @@ -367,9 +369,10 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; +#ifdef CONFIG_ARM64_HDBSS if (vcpu->kvm->enable_hdbss) kvm_flush_hdbss_buffer(vcpu); - +#endif if (ARM_SERROR_PENDING(exception_index)) { /* * The SError is handled by handle_exit_early(). 
If the guest diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 08bd10447103..c624e6dd54df 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -713,8 +713,10 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; +#ifdef CONFIG_ARM64_HDBSS if (prot & KVM_PGTABLE_PROT_DBM) attr |= KVM_PTE_LEAF_ATTR_HI_S2_DBM; +#endif attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; @@ -1320,8 +1322,10 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; +#ifdef CONFIG_ARM64_HDBSS if (prot & KVM_PGTABLE_PROT_DBM) set |= KVM_PTE_LEAF_ATTR_HI_S2_DBM; +#endif if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index edf9eb81dcdb..96904c5a77ab 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -226,7 +226,9 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * __activate_traps clear HCR_EL2.TGE (among other things). 
*/ __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); +#ifdef CONFIG_ARM64_HDBSS __load_hdbss(vcpu); +#endif __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 236d07c1b0b8..283e19127591 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -92,7 +92,9 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_restore_el1_state(guest_ctxt); __mpam_guest_load(); +#ifdef CONFIG_ARM64_HDBSS __load_hdbss(vcpu); +#endif vcpu_set_flag(vcpu, SYSREGS_ON_CPU); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 38eb97d1ad1f..23813b4338c4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1660,8 +1660,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; +#ifdef CONFIG_ARM64_HDBSS if (kvm->enable_hdbss && logging_active) prot |= KVM_PGTABLE_PROT_DBM; +#endif if (exec_fault) prot |= KVM_PGTABLE_PROT_X; @@ -1743,6 +1745,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); +#ifdef CONFIG_ARM64_HDBSS /* * HDBSS buffer already flushed when enter handle_trap_exceptions(). * Nothing to do here. 
@@ -1750,6 +1753,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) iss2 = ESR_ELx_ISS2(kvm_vcpu_get_esr(vcpu)); if (fault_status == ESR_ELx_FSC_PERM && (iss2 & ESR_ELx_HDBSSF)) return 1; +#endif if (fault_status == ESR_ELx_FSC_FAULT) { /* Beyond sanitised PARange (which is the IPA limit) */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index c98f6bc5fcc5..a8478c505de5 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -186,11 +186,13 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vcpu->arch.ccsidr); kvm_destroy_rec(vcpu); +#ifdef CONFIG_ARM64_HDBSS if (vcpu->arch.hdbss.br_el2) { hdbss_pg = phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2)); if (hdbss_pg) __free_pages(hdbss_pg, HDBSSBR_SZ(vcpu->arch.hdbss.br_el2)); } +#endif } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a4b41a0215d1..73938c9fa121 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -855,7 +855,9 @@ struct kvm { struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; +#ifdef CONFIG_ARM64_HDBSS bool enable_hdbss; +#endif }; #define kvm_err(fmt, ...) \ -- Gitee From 9d49503de44d27d3ecccd92f8fc71d855c11606a Mon Sep 17 00:00:00 2001 From: eillon Date: Wed, 2 Apr 2025 15:56:27 +0800 Subject: [PATCH 07/59] arm64/kabi: use KABI_EXTEND to skip KABI check commit 1d334a25caf930d12a66e7a346cdbacba4900910 openEuler Use KABI_EXTEND to skip KABI check of HDBSS feature. 
Signed-off-by: eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_host.h | 4 ++-- include/linux/kvm_host.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index aff5383a3958..c01d74a70d34 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -616,10 +616,10 @@ struct kvm_vcpu_arch { #ifdef CONFIG_ARM64_HDBSS /* HDBSS registers info */ - struct { + KABI_EXTEND(struct { u64 br_el2; u64 prod_el2; - } hdbss; + } hdbss) #endif }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 73938c9fa121..c19e08bc0fc0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -856,7 +856,7 @@ struct kvm { #endif char stats_id[KVM_STATS_NAME_SIZE]; #ifdef CONFIG_ARM64_HDBSS - bool enable_hdbss; + KABI_EXTEND(bool enable_hdbss) #endif }; -- Gitee From 2371cb7fa6124a4d03393887adafb0f6305d91e7 Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Tue, 26 Aug 2025 13:57:56 +0800 Subject: [PATCH 08/59] KVM: arm64: do not support hdbss in nvhe commit a5e3c939fbb1b9b780122570e196824f1be494ce openEuler In nVHE mode, after hdbss size is configured, the host still reports call trace messages during live migration. The modification in this commit ensures that after configuring the hdbss size in nvhe, the host will report nvhe does not support hdbss during live migration. 
Fixes: ("arm64/kvm: using ioctl to enable/disable the HDBSS feature") Signed-off-by: Jinqian Yang Signed-off-by: Eillon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/arm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3579cbeb9874..e09afdf96fcc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -94,6 +94,11 @@ static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, return -EINVAL; } + if (!is_kernel_in_hyp_mode()) { + kvm_err("Do not support HDBSS in non-VHE mode!\n"); + return -EINVAL; + } + if (size < 0 || size > HDBSS_MAX_SIZE) { kvm_err("Invalid HDBSS buffer size: %d!\n", size); return -EINVAL; -- Gitee From 3cb7edf13f2fefbf0c0b34f57d9985768e0bc097 Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Tue, 26 Aug 2025 13:57:57 +0800 Subject: [PATCH 09/59] KVM: arm64: fix memory leak in HDBSS commit 00c93102b328bfe49e863eac41a86cf5cec4ba4e openEuler Allocate hdbss page for each vcpu. If the allocation fails midway, the previously allocated page will not be released, causing a memory leak. This patch solves this problem. Fixes: ("arm64/kvm: using ioctl to enable/disable the HDBSS feature") Signed-off-by: Jinqian Yang Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/arm.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e09afdf96fcc..9f8174d52e24 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -81,6 +81,29 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) } #ifdef CONFIG_ARM64_HDBSS +static void kvm_clear_hdbss(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + struct page *hdbss_pg; + + kvm->arch.vtcr &= ~(VTCR_EL2_HD | VTCR_EL2_HDBSS); + + kvm_for_each_vcpu(i, vcpu, kvm) { + /* Kick vcpus to flush hdbss buffer. 
*/ + kvm_vcpu_kick(vcpu); + + hdbss_pg = phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2)); + if (hdbss_pg) + __free_pages(hdbss_pg, HDBSSBR_SZ(vcpu->arch.hdbss.br_el2)); + + vcpu->arch.hdbss.br_el2 = 0; + vcpu->arch.hdbss.prod_el2 = 0; + } + + kvm->enable_hdbss = false; +} + static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -113,6 +136,7 @@ static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, hdbss_pg = alloc_pages(GFP_KERNEL, size); if (!hdbss_pg) { kvm_err("Alloc HDBSS buffer failed!\n"); + kvm_clear_hdbss(kvm); return -EINVAL; } @@ -129,21 +153,7 @@ static int kvm_cap_arm_enable_hdbss(struct kvm *kvm, kvm_info("Enable HDBSS success, HDBSS buffer size: %d\n", size); } else if (kvm->enable_hdbss) { - kvm->arch.vtcr &= ~(VTCR_EL2_HD | VTCR_EL2_HDBSS); - - kvm_for_each_vcpu(i, vcpu, kvm) { - /* Kick vcpus to flush hdbss buffer. */ - kvm_vcpu_kick(vcpu); - - hdbss_pg = phys_to_page(HDBSSBR_BADDR(vcpu->arch.hdbss.br_el2)); - if (hdbss_pg) - __free_pages(hdbss_pg, HDBSSBR_SZ(vcpu->arch.hdbss.br_el2)); - - vcpu->arch.hdbss.br_el2 = 0; - vcpu->arch.hdbss.prod_el2 = 0; - } - - kvm->enable_hdbss = false; + kvm_clear_hdbss(kvm); kvm_info("Disable HDBSS success\n"); } -- Gitee From c4e3570d731ef2d0adfb0c48ebe9b1a158bcda61 Mon Sep 17 00:00:00 2001 From: dorso <2434317248@qq.com> Date: Tue, 13 Jan 2026 15:13:21 +0800 Subject: [PATCH 10/59] HAOC: Support ARM64 Page Table Protection (PTP). Implement page table isolation and protection between kernel and user address spaces when CONFIG_PTP is enabled. 
Signed-off-by: dorsoli --- arch/arm64/include/asm/fixmap.h | 4 + arch/arm64/include/asm/haoc/haoc-bitmap.h | 78 + arch/arm64/include/asm/haoc/haoc-def.h | 16 + arch/arm64/include/asm/haoc/haoc.h | 23 + arch/arm64/include/asm/haoc/iee-fixmap.h | 23 + arch/arm64/include/asm/haoc/iee-func.h | 4 + arch/arm64/include/asm/haoc/iee-init.h | 5 + arch/arm64/include/asm/haoc/iee-mmu.h | 13 + arch/arm64/include/asm/haoc/iee-pgtable.h | 1727 ++++++++++++++++++++ arch/arm64/include/asm/haoc/iee-ptp-init.h | 58 + arch/arm64/include/asm/haoc/iee.h | 1 + arch/arm64/include/asm/pgtable.h | 6 + arch/arm64/include/asm/tlb.h | 7 + arch/arm64/kernel/cpufeature.c | 10 + arch/arm64/kernel/haoc/Kconfig | 14 + arch/arm64/kernel/haoc/Makefile | 5 +- arch/arm64/kernel/haoc/haoc-bitmap.c | 202 +++ arch/arm64/kernel/haoc/haoc.c | 17 + arch/arm64/kernel/haoc/iee/iee-func.c | 75 +- arch/arm64/kernel/haoc/iee/iee-init.c | 3 +- arch/arm64/kernel/haoc/iee/iee-mmu.c | 397 ++++- arch/arm64/kernel/haoc/iee/iee-token.c | 14 +- arch/arm64/kernel/haoc/ptp/Makefile | 1 + arch/arm64/kernel/haoc/ptp/iee-ptp-init.c | 406 +++++ arch/arm64/kernel/haoc/ptp/ptp.c | 280 ++++ arch/arm64/kernel/setup.c | 5 + arch/arm64/kernel/vmlinux.lds.S | 12 + arch/arm64/mm/fault.c | 7 + arch/arm64/mm/fixmap.c | 35 + arch/arm64/mm/mmu.c | 61 +- arch/arm64/mm/pgd.c | 17 + arch/arm64/mm/trans_pgd.c | 7 + drivers/firmware/efi/arm-runtime.c | 7 + drivers/tty/serial/earlycon.c | 7 + drivers/usb/early/ehci-dbgp.c | 7 + include/linux/efi.h | 3 + include/linux/mm.h | 20 +- mm/debug_vm_pgtable.c | 14 + mm/early_ioremap.c | 28 + mm/haoc/ptp-pg_cache.c | 2 +- mm/huge_memory.c | 16 + mm/sparse-vmemmap.c | 49 +- 42 files changed, 3669 insertions(+), 17 deletions(-) create mode 100644 arch/arm64/include/asm/haoc/haoc-bitmap.h create mode 100644 arch/arm64/include/asm/haoc/iee-fixmap.h create mode 100644 arch/arm64/include/asm/haoc/iee-pgtable.h create mode 100644 arch/arm64/include/asm/haoc/iee-ptp-init.h create mode 100644 
arch/arm64/kernel/haoc/haoc-bitmap.c create mode 100644 arch/arm64/kernel/haoc/ptp/Makefile create mode 100644 arch/arm64/kernel/haoc/ptp/iee-ptp-init.c create mode 100644 arch/arm64/kernel/haoc/ptp/ptp.c diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 58c294a96676..85d92fb42024 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -109,6 +109,10 @@ void __init fixmap_copy(pgd_t *pgdir); extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot); +#ifdef CONFIG_PTP +#include +#endif + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/haoc/haoc-bitmap.h b/arch/arm64/include/asm/haoc/haoc-bitmap.h new file mode 100644 index 000000000000..1e6926afaa2a --- /dev/null +++ b/arch/arm64/include/asm/haoc/haoc-bitmap.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HAOC_BITMAP_H +#define _LINUX_HAOC_BITMAP_H + +#include +#include +#include + +#define HAOC_BITMAP_START VMEMMAP_END +#define haoc_bitmap_base ((uint8_t *)HAOC_BITMAP_START - (memstart_addr >> PAGE_SHIFT)) + +#define __pfn_to_haoc_bitmap(pfn) (haoc_bitmap_base + (pfn)) + +#define __va_to_haoc_bitmap(va) ({ \ + uint8_t *__ret; \ + if (__is_lm_address((u64)va)) \ + __ret = __pfn_to_haoc_bitmap(PHYS_PFN(__pa(va))); \ + else \ + __ret = __pfn_to_haoc_bitmap(PHYS_PFN(__pa_symbol(va))); \ + __ret; \ +}) + +/* HAOC_NORMAL means */ +enum HAOC_BITMAP_TYPE { + IEE_NORMAL = 0, /* Non-IEE pages */ + IEE_DATA, + IEE_PGTABLE, + IEE_CRED, + IEE_KEY, + IEE_SELINUX, +}; + +extern bool haoc_bitmap_ready; + +extern int haoc_bitmap_sparse_init(void); +extern void haoc_bitmap_setup(void); +extern void setup_iee_early_data_bitmap(void); + +static inline enum HAOC_BITMAP_TYPE iee_get_bitmap_type(unsigned long va) +{ + return *__va_to_haoc_bitmap(va); +} + +void _iee_set_bitmap_type(unsigned long __unused, + u64 va, enum HAOC_BITMAP_TYPE type, int num_pages); + +static inline void 
iee_set_bitmap_type(unsigned long va, + int num_pages, enum HAOC_BITMAP_TYPE type) +{ + // iee_rw_gate(IEE_OP_SET_BITMAP_TYPE, va, type, num_pages); +} + +static inline void iee_verify_type(unsigned long va, enum HAOC_BITMAP_TYPE type, + const char *name) +{ + // uint8_t bit_type = iee_get_bitmap_type(va); + + // if (unlikely(bit_type != type)) + // pr_err("IEE detected type: %d, fake %s: va(0x%lx)", bit_type, name, va); +} + +static inline void iee_verify_not_normal(void *start, void *end) +{ + // unsigned long addr; + + // if (!haoc_bitmap_ready) + // return; + // addr = start; + // while (addr < end) { + // if (unlikely(iee_get_bitmap_type(addr) == IEE_NORMAL)) { + // pr_err("IEE Detected data type normal, va(0x%lx)", addr); + // return; + // } + // addr += PAGE_SIZE; + // } +} + +#endif diff --git a/arch/arm64/include/asm/haoc/haoc-def.h b/arch/arm64/include/asm/haoc/haoc-def.h index 8563eeafacf1..833f4a25ea97 100644 --- a/arch/arm64/include/asm/haoc/haoc-def.h +++ b/arch/arm64/include/asm/haoc/haoc-def.h @@ -21,6 +21,7 @@ enum { IEE_OP_INVALIDATE_TOKEN, IEE_OP_VALIDATE_TOKEN, #endif + IEE_OP_SET_BITMAP_TYPE, #ifdef CONFIG_CREDP IEE_OP_COPY_CRED, IEE_OP_SET_CRED_UID, @@ -51,6 +52,21 @@ enum { IEE_OP_SET_CRED_SECURITY, IEE_OP_SET_CRED_RCU, IEE_OP_SET_CRED_UCOUNTS, +#endif +#ifdef CONFIG_PTP + IEE_OP_SET_TRAMP_PGD, + IEE_OP_SET_BM_PTE, + IEE_OP_SET_PTE, + IEE_OP_SET_PMD, + IEE_OP_SET_PUD, + IEE_OP_SET_P4D, + IEE_OP_SET_SWAPPER_PGD, + IEE_OP_SET_XCHG, + IEE_OP_SET_PMD_XCHG, + IEE_OP_SET_CMPXCHG, + IEE_OP_SET_PMD_CMPXCHG, + IEE_OP_SET_SENSITIVE_PTE, + IEE_OP_UNSET_SENSITIVE_PTE, #endif IEE_FLAG_END }; diff --git a/arch/arm64/include/asm/haoc/haoc.h b/arch/arm64/include/asm/haoc/haoc.h index 86c5e4286518..9c4beb011248 100644 --- a/arch/arm64/include/asm/haoc/haoc.h +++ b/arch/arm64/include/asm/haoc/haoc.h @@ -76,4 +76,27 @@ void _iee_set_cred_rcu(unsigned long __unused, struct cred *cred, struct rcu_hea void _iee_set_cred_ucounts(unsigned long __unused, struct cred 
*cred, struct ucounts *ucounts); #endif + +#include + +#ifdef CONFIG_PTP +void __iee_code _iee_set_static_pgd(int flag, pgd_t *pgdp, pgd_t pgd); +void __iee_code _iee_set_bm_pte(int flag, pte_t *ptep, pte_t pte); +void __iee_code _iee_set_pte(int flag, pte_t *ptep, pte_t pte); +void __iee_code _iee_set_pmd(int flag, pmd_t *pmdp, pmd_t pmd); +void __iee_code _iee_set_pud(int flag, pud_t *pudp, pud_t pud); +void __iee_code _iee_set_p4d(int flag, p4d_t *p4dp, p4d_t p4d); +void __iee_code _iee_set_swapper_pgd(int flag, pgd_t *pgdp, pgd_t pgd); +pteval_t __iee_code _iee_set_xchg_relaxed(int flag, pte_t *ptep, pteval_t pteval); +pmdval_t __iee_code _iee_set_pmd_xchg_relaxed(int flag, pmd_t *pmdp, pmdval_t pmdval); +pteval_t __iee_code _iee_set_cmpxchg_relaxed(int flag, pte_t *ptep, pteval_t old_pteval, + pteval_t new_pteval); +pmdval_t __iee_code _iee_set_pmd_cmpxchg_relaxed(int flag, pmd_t *pmdp, + pmdval_t old_pmdval, pmdval_t new_pmdval); +void __iee_code _iee_set_sensitive_pte(int flag, pte_t *lm_ptep, pte_t *iee_ptep, + int order, int use_block_pmd); +void __iee_code _iee_unset_sensitive_pte(int flag, pte_t *lm_ptep, pte_t *iee_ptep, + int order, int use_block_pmd); +#endif + #endif diff --git a/arch/arm64/include/asm/haoc/iee-fixmap.h b/arch/arm64/include/asm/haoc/iee-fixmap.h new file mode 100644 index 000000000000..628ff4a5c687 --- /dev/null +++ b/arch/arm64/include/asm/haoc/iee-fixmap.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IEE_FIXMAP_H +#define _LINUX_IEE_FIXMAP_H + +#include + +#ifndef clear_fixmap_pre_init +#define clear_fixmap_pre_init(idx) \ + __iee_set_fixmap_pre_init(idx, 0, FIXMAP_PAGE_CLEAR) +#endif + +#define __iee_set_fixmap_offset_pre_init(idx, phys, flags) \ +({ \ + unsigned long ________addr; \ + __iee_set_fixmap_pre_init(idx, phys, flags); \ + ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ + ________addr; \ +}) + +#define iee_set_fixmap_offset_pre_init(idx, phys) \ + 
__iee_set_fixmap_offset_pre_init(idx, phys, FIXMAP_PAGE_NORMAL) + +#endif diff --git a/arch/arm64/include/asm/haoc/iee-func.h b/arch/arm64/include/asm/haoc/iee-func.h index 2c57ea4aff81..8a37ff84a3d0 100644 --- a/arch/arm64/include/asm/haoc/iee-func.h +++ b/arch/arm64/include/asm/haoc/iee-func.h @@ -10,9 +10,13 @@ #ifndef _LINUX_IEE_ASM_FUNC_H #define _LINUX_IEE_ASM_FUNC_H +#include + extern void set_iee_address(unsigned long addr, unsigned int order, bool valid); +extern void set_iee_address_valid(unsigned long lm_addr, unsigned int order); extern void iee_set_logical_mem(unsigned long addr, unsigned int order, bool prot); extern void put_pages_into_iee(unsigned long addr, int order); +extern void remove_pages_from_iee(unsigned long addr, int order); extern void set_iee_page(unsigned long addr, int order); extern void unset_iee_page(unsigned long addr, int order); diff --git a/arch/arm64/include/asm/haoc/iee-init.h b/arch/arm64/include/asm/haoc/iee-init.h index 31467199c11a..515b4b334524 100644 --- a/arch/arm64/include/asm/haoc/iee-init.h +++ b/arch/arm64/include/asm/haoc/iee-init.h @@ -16,5 +16,10 @@ extern char iee_init_data_begin[]; extern char iee_init_data_end[]; +extern char __iee_ptp_data_start[]; +extern char __iee_ptp_data_end[]; + +extern spinlock_t swapper_pgdir_lock; +extern struct mutex fixmap_lock; #endif diff --git a/arch/arm64/include/asm/haoc/iee-mmu.h b/arch/arm64/include/asm/haoc/iee-mmu.h index bc43314bd344..d921b58ac035 100644 --- a/arch/arm64/include/asm/haoc/iee-mmu.h +++ b/arch/arm64/include/asm/haoc/iee-mmu.h @@ -10,6 +10,10 @@ #ifndef _LINUX_IEE_MMU_H #define _LINUX_IEE_MMU_H +#ifdef CONFIG_PTP +#include +#endif + extern phys_addr_t __init early_iee_stack_alloc(int order); extern phys_addr_t __init early_iee_data_alloc(int shift); extern void __iee_create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, @@ -17,6 +21,15 @@ extern void __iee_create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, pgprot_t prot, phys_addr_t 
(*pgtable_alloc)(int), int flags); +extern phys_addr_t __init early_iee_pgtable_alloc(int shift); +extern phys_addr_t __init early_pgtable_alloc(int shift); +extern phys_addr_t __pgd_pgtable_alloc(int shift); +extern phys_addr_t pgd_pgtable_alloc(int shift); +extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags); extern void __init iee_init_mappings(pgd_t *pgdp); extern void __init init_early_iee_data(void); extern void __init early_iee_data_cache_init(void); diff --git a/arch/arm64/include/asm/haoc/iee-pgtable.h b/arch/arm64/include/asm/haoc/iee-pgtable.h new file mode 100644 index 000000000000..57f6aa4b66ac --- /dev/null +++ b/arch/arm64/include/asm/haoc/iee-pgtable.h @@ -0,0 +1,1727 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_IEE_PGTABLE_H +#define __ASM_IEE_PGTABLE_H + +#include +#include + +#include +#include +#include +#include +#include +/* + * VMALLOC range. 
+ * + * VMALLOC_START: beginning of the kernel vmalloc space + * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space + * and fixed mappings + */ +#define VMALLOC_START (MODULES_END) +#define VMALLOC_END (VMEMMAP_START - SZ_256M) + +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern bool haoc_enabled; +extern pgd_t tramp_pg_dir[]; +extern pgd_t idmap_pg_dir[]; + +extern int __pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +static inline bool in_swapper_pgdir(void *addr); +static void iee_set_swapper_pgd_pre_init(pgd_t *pgdp, pgd_t pgd); + +static inline bool in_tramp_pgdir(void *addr) +{ + return ((unsigned long)addr & PAGE_MASK) == + ((unsigned long)tramp_pg_dir & PAGE_MASK); +} + +static inline bool in_idmap_pgdir(void *addr) +{ + return ((unsigned long)addr & PAGE_MASK) == + ((unsigned long)idmap_pg_dir & PAGE_MASK); +} + +static inline void iee_set_pte_pre_init(pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*ptep, pte); + + /* + * Only if the new pte is valid and kernel, otherwise TLB maintenance + * or update_mmu_cache() have the necessary barriers. 
+ */ + dsb(ishst); + isb(); +} + +static inline void iee_set_pmd_pre_init(pmd_t *pmdp, pmd_t pmd) +{ +#ifdef __PAGETABLE_PMD_FOLDED + if (in_swapper_pgdir(pmdp)) { + iee_set_swapper_pgd_pre_init((pgd_t *)pmdp, __pgd(pmd_val(pmd))); + return; + } +#endif /* __PAGETABLE_PMD_FOLDED */ + + WRITE_ONCE(*pmdp, pmd); + + dsb(ishst); + isb(); +} + +static inline void iee_set_pud_pre_init(pud_t *pudp, pud_t pud) +{ + #ifdef __PAGETABLE_PUD_FOLDED + if (in_swapper_pgdir(pudp)) { + iee_set_swapper_pgd_pre_init((pgd_t *)pudp, __pgd(pud_val(pud))); + return; + } + #endif + + WRITE_ONCE(*pudp, pud); + + dsb(ishst); + isb(); +} + +static inline void __maybe_unused iee_set_p4d_pre_init(p4d_t *p4dp, p4d_t p4d) +{ + if (in_swapper_pgdir(p4dp)) { + iee_set_swapper_pgd_pre_init((pgd_t *)p4dp, __pgd(p4d_val(p4d))); + return; + } + + WRITE_ONCE(*p4dp, p4d); + dsb(ishst); + isb(); +} + +static inline void iee_set_pgd_pre_init(pgd_t *pgdp, pgd_t pgd) +{ + if (in_swapper_pgdir(pgdp)) { + iee_set_swapper_pgd_pre_init(pgdp, __pgd(pgd_val(pgd))); + return; + } + + WRITE_ONCE(*pgdp, pgd); + dsb(ishst); + isb(); +} + +#define pte_set_fixmap_pre_init(addr) \ + ((pte_t *)iee_set_fixmap_offset_pre_init(FIX_PTE, addr)) +#define pte_set_fixmap_offset_pre_init(pmd, addr) \ + pte_set_fixmap_pre_init(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap_pre_init() clear_fixmap_pre_init(FIX_PTE) + +#define pmd_set_fixmap_pre_init(addr) \ + ((pmd_t *)iee_set_fixmap_offset_pre_init(FIX_PMD, addr)) +#define pmd_set_fixmap_offset_pre_init(pud, addr) \ + pmd_set_fixmap_pre_init(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap_pre_init() clear_fixmap_pre_init(FIX_PMD) + +#define pud_set_fixmap_pre_init(addr) \ + ((pud_t *)iee_set_fixmap_offset_pre_init(FIX_PUD, addr)) +#define pud_set_fixmap_offset_pre_init(p4d, addr)\ + pud_set_fixmap_pre_init(pud_offset_phys(p4d, addr)) +#define pud_clear_fixmap_pre_init() clear_fixmap_pre_init(FIX_PUD) + +#define pgd_set_fixmap_pre_init(addr) \ + ((pgd_t 
*)iee_set_fixmap_offset_pre_init(FIX_PGD, addr)) +#define pgd_clear_fixmap_pre_init() clear_fixmap_pre_init(FIX_PGD) + +static void iee_set_swapper_pgd_pre_init(pgd_t *pgdp, pgd_t pgd) +{ + WRITE_ONCE(*pgdp, pgd); + dsb(ishst); + isb(); +} + +static inline pteval_t iee_set_xchg_relaxed(pte_t *ptep, pteval_t pteval) +{ + pteval_t ret; + + ret = iee_rw_gate(IEE_OP_SET_XCHG, ptep, pteval); + return (pteval_t)ret; +} + +static inline pmdval_t iee_set_pmd_xchg_relaxed(pmd_t *pmdp, pmdval_t pmdval) +{ + pmdval_t ret; + + ret = iee_rw_gate(IEE_OP_SET_PMD_XCHG, pmdp, pmdval); + return (pmdval_t)ret; +} + +static inline pteval_t iee_set_cmpxchg_relaxed(pte_t *ptep, pteval_t old_pteval, + pteval_t new_pteval) +{ + pteval_t ret; + + ret = iee_rw_gate(IEE_OP_SET_CMPXCHG, ptep, old_pteval, new_pteval); + return ret; +} + +static inline pmdval_t iee_set_pmd_cmpxchg_relaxed(pmd_t *pmdp, pmdval_t old_pmdval, + pmdval_t new_pmdval) +{ + pmdval_t ret; + + ret = iee_rw_gate(IEE_OP_SET_PMD_CMPXCHG, pmdp, old_pmdval, new_pmdval); + return ret; +} + +static inline void iee_set_static_pgd(pgd_t *pgdp, pgd_t pgd) +{ + iee_rw_gate(IEE_OP_SET_TRAMP_PGD, pgdp, pgd); +} + +static inline void iee_set_bm_pte(pte_t *ptep, pte_t pte) +{ + iee_rw_gate(IEE_OP_SET_BM_PTE, ptep, pte); + + /* + * Only if the new pte is valid and kernel, otherwise TLB maintenance + * or update_mmu_cache() have the necessary barriers. + */ + dsb(ishst); + isb(); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE + +/* Set stride and tlb_level in flush_*_tlb_range */ +#define flush_pmd_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) +#define flush_pud_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Outside of a few very special situations (e.g. 
hibernation), we always + * use broadcast TLB invalidation instructions, therefore a spurious page + * fault on one CPU which has been handled concurrently by another CPU + * does not need to perform additional invalidation. + */ +#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) + +#define pte_ERROR(e) \ + pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) + +/* + * Macros to convert between a physical address and its placement in a + * page table entry, taking care of 52-bit addresses. + */ +#ifdef CONFIG_ARM64_PA_BITS_52 +static inline phys_addr_t __pte_to_phys(pte_t pte) +{ + return (pte_val(pte) & PTE_ADDR_LOW) | + ((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT); +} +static inline pteval_t __phys_to_pte_val(phys_addr_t phys) +{ + return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK; +} +#else +#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK) +#define __phys_to_pte_val(phys) (phys) +#endif + +#define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) +#define pfn_pte(pfn,prot) \ + __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) + +#define pte_none(pte) (!pte_val(pte)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) +#define pte_page(pte) (pfn_to_page(pte_pfn(pte))) + +/* + * The following only work if pte_present(). Undefined behaviour otherwise. 
+ */ +#define pte_present(pte) (!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE))) +#define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) +#define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) +#define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) +#define pte_rdonly(pte) (!!(pte_val(pte) & PTE_RDONLY)) +#define pte_user(pte) (!!(pte_val(pte) & PTE_USER)) +#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) +#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) +#define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP)) +#define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \ + PTE_ATTRINDX(MT_NORMAL_TAGGED)) + +#define pte_cont_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \ + (__boundary - 1 < (end) - 1) ? __boundary : (end); \ +}) + +#define pmd_cont_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK; \ + (__boundary - 1 < (end) - 1) ? __boundary : (end); \ +}) + +#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte)) +#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) +#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) + +#define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) +/* + * Execute-only user mappings do not have the PTE_USER bit set. All valid + * kernel mappings have the PTE_UXN bit set. + */ +#define pte_valid_not_user(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) +/* + * Could the pte be present in the TLB? We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + * Note that we can't make any assumptions based on the state of the access + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the + * TLB. 
+ */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) + +/* + * p??_access_permitted() is true for valid user mappings (PTE_USER + * bit set, subject to the write permission check). For execute-only + * mappings, like PROT_EXEC with EPAN (both PTE_USER and PTE_UXN bits + * not set) must return false. PROT_NONE mappings do not have the + * PTE_VALID bit set. + */ +#define pte_access_permitted(pte, write) \ + (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) +#define pmd_access_permitted(pmd, write) \ + (pte_access_permitted(pmd_pte(pmd), (write))) +#define pud_access_permitted(pud, write) \ + (pte_access_permitted(pud_pte(pud), (write))) + +static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) +{ + pte_val(pte) &= ~pgprot_val(prot); + return pte; +} + +static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) +{ + pte_val(pte) |= pgprot_val(prot); + return pte; +} + +static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + pmd_val(pmd) &= ~pgprot_val(prot); + return pmd; +} + +static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + pmd_val(pmd) |= pgprot_val(prot); + return pmd; +} + +static inline pte_t pte_mkwrite_novma(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + if (pte_write(pte)) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + /* + * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY + * clear), set the PTE_DIRTY bit. 
+ */ + if (pte_hw_dirty(pte)) + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_AF)); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_AF)); +} + +static inline pte_t pte_mkcont(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); +} + +static inline pte_t pte_mknoncont(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_CONT)); +} + +static inline pte_t pte_mkpresent(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_VALID)); +} + +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); +} + +static inline void __set_pte(pte_t *ptep, pte_t pte) +{ + if (haoc_enabled){ + iee_rw_gate(IEE_OP_SET_PTE, ptep, pte); + dsb(ishst); + isb(); + } else { + WRITE_ONCE(*ptep, pte); + + /* + * Only if the new pte is valid and kernel, otherwise TLB maintenance + * or update_mmu_cache() have the necessary barriers. + */ + if (pte_valid_not_user(pte)) { + dsb(ishst); + isb(); + } + } +} + +static inline pte_t __ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + +extern void __sync_icache_dcache(pte_t pteval); +bool pgattr_change_is_safe(u64 old, u64 new); + +/* + * PTE bits configuration in the presence of hardware Dirty Bit Management + * (PTE_WRITE == PTE_DBM): + * + * Dirty Writable | PTE_RDONLY PTE_WRITE PTE_DIRTY (sw) + * 0 0 | 1 0 0 + * 0 1 | 1 1 0 + * 1 0 | 1 0 1 + * 1 1 | 0 1 x + * + * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via + * the page fault mechanism. 
Checking the dirty status of a pte becomes: + * + * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) + */ + +static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, + pte_t pte) +{ + pte_t old_pte; + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + old_pte = __ptep_get(ptep); + + if (!pte_valid(old_pte) || !pte_valid(pte)) + return; + if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) + return; + + /* + * Check for potential race with hardware updates of the pte + * (__ptep_set_access_flags safely changes valid ptes without going + * through an invalid entry). + */ + VM_WARN_ONCE(!pte_young(pte), + "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); + VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), + "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); + VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)), + "%s: unsafe attribute change: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); +} + +static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) +{ + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte); + + /* + * If the PTE would provide user space access to the tags associated + * with it then ensure that the MTE tags are synchronised. Although + * pte_access_permitted() returns false for exec only mappings, they + * don't expose tags (instruction fetches don't check tags). 
+ */ + if (system_supports_mte() && pte_access_permitted(pte, false) && + !pte_special(pte) && pte_tagged(pte)) + mte_sync_tags(pte, nr_pages); +} + +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) +{ + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); +} + +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + __sync_cache_and_tags(pte, nr); + + for (;;) { + __check_safe_pte_update(mm, ptep, pte); + __set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = pte_advance_pfn(pte, 1); + } +} + +/* + * Huge pte definitions. + */ +#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT)) + +/* + * Hugetlb definitions. 
+ */ +#define HUGE_MAX_HSTATE 4 +#define HPAGE_SHIFT PMD_SHIFT +#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) + +static inline pte_t pgd_pte(pgd_t pgd) +{ + return __pte(pgd_val(pgd)); +} + +static inline pte_t p4d_pte(p4d_t p4d) +{ + return __pte(p4d_val(p4d)); +} + +static inline pte_t pud_pte(pud_t pud) +{ + return __pte(pud_val(pud)); +} + +static inline pud_t pte_pud(pte_t pte) +{ + return __pud(pte_val(pte)); +} + +static inline pmd_t pud_pmd(pud_t pud) +{ + return __pmd(pud_val(pud)); +} + +static inline pte_t pmd_pte(pmd_t pmd) +{ + return __pte(pmd_val(pmd)); +} + +static inline pmd_t pte_pmd(pte_t pte) +{ + return __pmd(pte_val(pte)); +} + +static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT); +} + +static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & PTE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); +} + +#ifdef CONFIG_NUMA_BALANCING +/* + * See the comment in include/linux/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} +#endif + +#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID)) + +static inline int pmd_present(pmd_t pmd) +{ + return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd); +} + +/* + * THP definitions. 
+ */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) +#define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) +#define pmd_user(pmd) pte_user(pmd_pte(pmd)) +#define pmd_user_exec(pmd) pte_user_exec(pmd_pte(pmd)) +#define pmd_cont(pmd) pte_cont(pmd_pte(pmd)) +#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) +#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) +#define pmd_mkwrite_novma(pmd) pte_pmd(pte_mkwrite_novma(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) +#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) +#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) + +static inline pmd_t pmd_mkinvalid(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID)); + pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID)); + + return pmd; +} + +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) + +#define pmd_write(pmd) pte_write(pmd_pte(pmd)) + +#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) +#endif +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); +} + +#define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) +#define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) +#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) +#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) + +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_write(pud) pte_write(pud_pte(pud)) + +#define 
pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) + +#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) +#define __phys_to_pud_val(phys) __phys_to_pte_val(phys) +#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) +#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) + +static inline void __set_pte_at(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + __sync_cache_and_tags(pte, nr); + __check_safe_pte_update(mm, ptep, pte); + __set_pte(ptep, pte); +} + +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + page_table_check_pmd_set(mm, pmdp, pmd); + return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd), + PMD_SIZE >> PAGE_SHIFT); +} + +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(mm, pudp, pud); + return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud), + PUD_SIZE >> PAGE_SHIFT); +} + +#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) +#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) + +#define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) +#define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) + +#define __pgprot_modify(prot,mask,bits) \ + __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) + +#define pgprot_nx(prot) \ + __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) + +#define pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + +/* + * Mark the prot value as uncacheable and unbufferable. 
+ */ +#define pgprot_noncached(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN) +#define pgprot_writecombine(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) +#define pgprot_device(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) +#define pgprot_tagged(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED)) +#define pgprot_mhp pgprot_tagged +/* + * DMA allocations for non-coherent devices use what the Arm architecture calls + * "Normal non-cacheable" memory, which permits speculation, unaligned accesses + * and merging of writes. This is different from "Device-nGnR[nE]" memory which + * is intended for MMIO and thus forbids speculation, preserves access size, + * requires strict alignment and can also force write responses to come from the + * endpoint. + */ +#define pgprot_dmacoherent(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, \ + PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) + +#ifdef CONFIG_ALTRA_ERRATUM_82288 +extern bool have_altra_erratum_82288; +extern bool range_is_pci(phys_addr_t, size_t); +#endif + +static inline pte_t pte_mkspecial(pte_t pte) +{ +#ifdef CONFIG_ALTRA_ERRATUM_82288 + phys_addr_t phys; + pgprot_t prot; + + if (have_altra_erratum_82288) { + phys = __pte_to_phys(pte); + prot = __pgprot(pte_val(pte) & ~PTE_ADDR_MASK); + + if (range_is_pci(phys, PAGE_SIZE)) { + pte = __pte(__phys_to_pte_val(phys) | pgprot_val(pgprot_device(prot))); + } + } +#endif + + return set_pte_bit(pte, __pgprot(PTE_SPECIAL)); +} + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); + +#define pmd_none(pmd) (!pmd_val(pmd)) + +#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ + PMD_TYPE_TABLE) +#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ + 
PMD_TYPE_SECT) +#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) +#define pmd_bad(pmd) (!pmd_table(pmd)) + +#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) +#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE) + +#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 +static inline bool pud_sect(pud_t pud) { return false; } +static inline bool pud_table(pud_t pud) { return true; } +#else +#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ + PUD_TYPE_SECT) +#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ + PUD_TYPE_TABLE) +#endif + +extern pgd_t init_pg_dir[PTRS_PER_PGD]; +extern pgd_t init_pg_end[]; +extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; +extern pgd_t reserved_pg_dir[PTRS_PER_PGD]; + +extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline bool in_swapper_pgdir(void *addr) +{ + return ((unsigned long)addr & PAGE_MASK) == + ((unsigned long)swapper_pg_dir & PAGE_MASK); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) +{ +#ifdef __PAGETABLE_PMD_FOLDED + if (in_swapper_pgdir(pmdp)) { + set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd))); + return; + } +#endif /* __PAGETABLE_PMD_FOLDED */ + + if (haoc_enabled) + iee_rw_gate(IEE_OP_SET_PMD, pmdp, pmd); + else + WRITE_ONCE(*pmdp, pmd); + + if (pmd_valid(pmd)) { + dsb(ishst); + isb(); + } +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + set_pmd(pmdp, __pmd(0)); +} + +static inline phys_addr_t pmd_page_paddr(pmd_t pmd) +{ + return __pmd_to_phys(pmd); +} + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_page_paddr(pmd)); +} + +/* Find an entry in the third-level page table. 
*/ +#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) + +#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) +#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap() clear_fixmap(FIX_PTE) + +#define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd)) + +/* use ONLY for statically allocated translation tables */ +#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ +#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot) + +#if CONFIG_PGTABLE_LEVELS > 2 + +#define pmd_ERROR(e) \ + pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) + +#define pud_none(pud) (!pud_val(pud)) +#define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \ + PUD_TYPE_TABLE) +#define pud_present(pud) pte_present(pud_pte(pud)) +#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) +#define pud_valid(pud) pte_valid(pud_pte(pud)) +#define pud_user(pud) pte_user(pud_pte(pud)) +#define pud_user_exec(pud) pte_user_exec(pud_pte(pud)) + +static inline void set_pud(pud_t *pudp, pud_t pud) +{ +#ifdef __PAGETABLE_PUD_FOLDED + if (in_swapper_pgdir(pudp)) { + set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); + return; + } +#endif /* __PAGETABLE_PUD_FOLDED */ + + if (haoc_enabled) + iee_rw_gate(IEE_OP_SET_PUD, pudp, pud); + else + WRITE_ONCE(*pudp, pud); + + if (pud_valid(pud)) { + dsb(ishst); + isb(); + } +} + +static inline void pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); +} + +static inline phys_addr_t pud_page_paddr(pud_t pud) +{ + return __pud_to_phys(pud); +} + +static inline pmd_t *pud_pgtable(pud_t pud) +{ + return (pmd_t *)__va(pud_page_paddr(pud)); +} + +/* Find an entry in the second-level page table. 
*/ +#define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t)) + +#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) +#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap() clear_fixmap(FIX_PMD) + +#define pud_page(pud) phys_to_page(__pud_to_phys(pud)) + +/* use ONLY for statically allocated translation tables */ +#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) + +#else + +#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +#define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */ + +/* Match pmd_offset folding in */ +#define pmd_set_fixmap(addr) NULL +#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) +#define pmd_clear_fixmap() + +#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) + +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ + +#if CONFIG_PGTABLE_LEVELS > 3 + +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) + +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) +#define p4d_present(p4d) (p4d_val(p4d)) +extern bool check_addr_in_iee_valid(unsigned long addr); + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + + if (in_swapper_pgdir(p4dp)) { + set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); + return; + } + if (haoc_enabled){ + if (in_tramp_pgdir(p4dp)) { + iee_set_static_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); + return; + } + + if (in_idmap_pgdir(p4dp)) { + iee_set_static_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); + return; + } + + iee_rw_gate(IEE_OP_SET_P4D, p4dp, p4d); + } else { + WRITE_ONCE(*p4dp, p4d); + } + + dsb(ishst); + isb(); +} + +static inline void p4d_clear(p4d_t *p4dp) +{ + set_p4d(p4dp, __p4d(0)); +} + +static inline phys_addr_t p4d_page_paddr(p4d_t p4d) +{ + return __p4d_to_phys(p4d); +} + +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + return (pud_t *)__va(p4d_page_paddr(p4d)); +} + 
+/* Find an entry in the first-level page table. */ +#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) + +#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) +#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr)) +#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + +#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) + +/* use ONLY for statically allocated translation tables */ +#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) + +#else + +#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) +#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) + +/* Match pud_offset folding in */ +#define pud_set_fixmap(addr) NULL +#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) +#define pud_clear_fixmap() + +#define pud_offset_kimg(dir,addr) ((pud_t *)dir) + +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ + +#define pgd_ERROR(e) \ + pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e)) + +#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) +#define pgd_clear_fixmap() clear_fixmap(FIX_PGD) + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* + * Normal and Normal-Tagged are two different memory types and indices + * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK. + */ + const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | + PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP | + PTE_ATTRINDX_MASK; + /* preserve the hardware dirty information */ + if (pte_hw_dirty(pte)) + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); + /* + * If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware + * dirtiness again. 
+ */ + if (pte_sw_dirty(pte)) + pte = pte_mkdirty(pte); + return pte; +} + +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); +} + +extern int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + if (haoc_enabled) + return __pmdp_set_access_flags(vma, address, pmdp, + entry, dirty); + else + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} +#endif + +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud)); +} +#endif + +/* + * Atomic pte/pmd modifications. 
+ */ +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t old_pte, pte; + + pte = __ptep_get(ptep); + do { + old_pte = pte; + pte = pte_mkold(pte); + if (haoc_enabled) + pte_val(pte) = iee_set_cmpxchg_relaxed(ptep, + pte_val(old_pte), pte_val(pte)); + else + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); + + return pte_young(pte); +} + +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young = __ptep_test_and_clear_young(vma, address, ptep); + + if (young) { + /* + * We can elide the trailing DSB here since the worst that can + * happen is that a CPU continues to use the young entry in its + * TLB and we mistakenly reclaim the associated page. The + * window for such an event is bounded by the next + * context-switch, which provides a DSB to complete the TLB + * invalidation. 
+ */ + flush_tlb_page_nosync(vma, address); + } + + return young; +} + +static inline int __pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t old_pmd, pmd; + + pmd = READ_ONCE(*pmdp); + do { + old_pmd = pmd; + pmd = __pmd(pmd_val(pmd) & ~PTE_AF); + pmd_val(pmd) = iee_set_pmd_cmpxchg_relaxed(pmdp, + pmd_val(old_pmd), pmd_val(pmd)); + } while (pmd_val(pmd) != pmd_val(old_pmd)); + + return pmd_young(pmd); +} + +static inline void __pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp, + pmd_t pmd) +{ + pmd_t old_pmd; + + do { + old_pmd = pmd; + pmd = pmd_wrprotect(pmd); + pmd_val(pmd) = iee_set_pmd_cmpxchg_relaxed(pmdp, pmd_val(old_pmd), pmd_val(pmd)); + } while (pmd_val(pmd) != pmd_val(old_pmd)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + if (haoc_enabled) + return __pmdp_test_and_clear_young(vma, address, pmdp); + else + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + pte_t pte; + + if (haoc_enabled){ + pteval_t pteval = iee_set_xchg_relaxed((pte_t *)&pte_val(*ptep), (pteval_t)0); + pte = __pte(pteval); + } else{ + pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); + } + + page_table_check_pte_clear(mm, pte); + + return pte; +} + +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, 
ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmd; + + if (haoc_enabled){ + pteval_t pteval = iee_set_xchg_relaxed((pte_t *)&pmd_val(*pmdp), (pteval_t)0); + pmd = __pmd(pteval); + } else { + pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); + } + + page_table_check_pmd_clear(mm, pmd); + + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) +{ + pte_t old_pte; + + do { + old_pte = pte; + pte = pte_wrprotect(pte); + if (haoc_enabled) + pte_val(pte) = iee_set_cmpxchg_relaxed(ptep, pte_val(old_pte), pte_val(pte)); + else + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); +} + +/* + * __ptep_set_wrprotect - mark read-only while transferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 
+ */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + if (haoc_enabled) + __pmdp_set_wrprotect(mm, address, pmdp, READ_ONCE(*pmdp)); + else + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); +} + +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); + if (haoc_enabled) + return __pmd(iee_set_pmd_xchg_relaxed((pmd_t *)&pmd_val(*pmdp), pmd_val(pmd))); + else + return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); +} +#endif + +/* + * Encode and decode a swap entry: + * bits 0-1: present (must be zero) + * bits 2: remember PG_anon_exclusive + * bits 3-7: swap type + * bits 8-57: swap offset + * bit 58: PTE_PROT_NONE (must be zero) + */ +#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_BITS 5 +#define __SWP_OFFSET_BITS 50 +#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) +#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) + +#define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) +#define __swp_offset(x) (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK) +#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) + +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +#define __swp_entry_to_pte(swp) ((pte_t) { 
(swp).val }) + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) +#define __swp_entry_to_pmd(swp) __pmd((swp).val) +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/* + * Ensure that there are not more swap files than can be encoded in the kernel + * PTEs. + */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) + +#ifdef CONFIG_ARM64_MTE + +#define __HAVE_ARCH_PREPARE_TO_SWAP +extern int arch_prepare_to_swap(struct folio *folio); + +#define __HAVE_ARCH_SWAP_INVALIDATE +static inline void arch_swap_invalidate_page(int type, pgoff_t offset) +{ + if (system_supports_mte()) + mte_invalidate_tags(type, offset); +} + +static inline void arch_swap_invalidate_area(int type) +{ + if (system_supports_mte()) + mte_invalidate_tags_area(type); +} + +#define __HAVE_ARCH_SWAP_RESTORE +extern void arch_swap_restore(swp_entry_t entry, struct folio *folio); + +#endif /* CONFIG_ARM64_MTE */ + +/* + * On AArch64, the cache coherency is handled via the __set_ptes() function. + */ +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) +{ + /* + * We don't do anything here, so there's a very small chance of + * us retaking a user fault which we just fixed up. The alternative + * is doing a dsb(ishst), but that penalises the fastpath. + */ +} + +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) +#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) + +#ifdef CONFIG_ARM64_PA_BITS_52 +#define phys_to_ttbr(addr) (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52) +#else +#define phys_to_ttbr(addr) (addr) +#endif + +/* + * On arm64 without hardware Access Flag, copying from user will fail because + * the pte is old and cannot be marked young. So we always end up with zeroed + * page after fork() + CoW for pfn mappings. 
We don't always have a + * hardware-managed access flag on arm64. + */ +#define arch_has_hw_pte_young cpu_has_hw_af + +/* + * Experimentally, it's cheap to set the access flag in hardware and we + * benefit from prefaulting mappings as 'old' to start with. + */ +#define arch_wants_old_prefaulted_pte cpu_has_hw_af + +static inline bool pud_sect_supported(void) +{ + return PAGE_SIZE == SZ_4K; +} + + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +#define ptep_modify_prot_start ptep_modify_prot_start +extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define ptep_modify_prot_commit ptep_modify_prot_commit +extern void ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t new_pte); + +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). 
+ */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. 
+ */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. 
+ */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. + */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t 
get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define wrprotect_ptes wrprotect_ptes +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). 
+ */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + wrprotect_ptes(mm, addr, ptep, 1); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + +#endif /* CONFIG_ARM64_CONTPTE */ + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/haoc/iee-ptp-init.h b/arch/arm64/include/asm/haoc/iee-ptp-init.h new file mode 100644 index 000000000000..7dd6bd870189 --- /dev/null +++ 
b/arch/arm64/include/asm/haoc/iee-ptp-init.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IEE_PTP_INIT_H +#define _LINUX_IEE_PTP_INIT_H + +#include + +#define NR_BM_PTE_TABLES \ + SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) +#define NR_BM_PMD_TABLES \ + SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PUD_SHIFT) + +static_assert(NR_BM_PMD_TABLES == 1); + +#define __BM_TABLE_IDX(addr, shift) \ + (((addr) >> (shift)) - (FIXADDR_TOT_START >> (shift))) + +#define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) + +extern pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __section(".iee.ptp") __aligned(PAGE_SIZE); +extern pmd_t bm_pmd[PTRS_PER_PMD] __section(".iee.ptp") __aligned(PAGE_SIZE) __maybe_unused; +extern pud_t bm_pud[PTRS_PER_PUD] __section(".iee.ptp") __aligned(PAGE_SIZE) __maybe_unused; + +extern void __iee_pgd_populate_pre_init(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot); +extern void __iee_p4d_populate_pre_init(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot); +extern void __iee_pud_populate_pre_init(pud_t *pudp, phys_addr_t pmdp, pudval_t prot); +extern void __iee_pmd_populate_pre_init(pmd_t *pmdp, phys_addr_t ptep, + pmdval_t prot); +extern int iee_pud_set_huge_pre_init(pud_t *pudp, phys_addr_t phys, pgprot_t prot); +extern int iee_pmd_set_huge_pre_init(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot); + +extern void __iee_set_fixmap_pre_init(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +extern void iee_set_pgtable_pre_init(unsigned long *addr, unsigned long content); +extern void set_iee_address_pre_init(unsigned long addr, bool valid); +extern void iee_alloc_init_pud_pre_init(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags); +extern void __create_pgd_mapping_pre_init(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags); + 
+extern void __init efi_memmap_unmap_after_init(void); + +extern int early_ioremap_debug __initdata; +extern int after_paging_init __initdata; +extern void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +extern unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +extern unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +extern bool pgattr_change_is_safe(u64 old, u64 new); + +void __init fixmap_copy_ptp(pgd_t *pgdir); + +#endif diff --git a/arch/arm64/include/asm/haoc/iee.h b/arch/arm64/include/asm/haoc/iee.h index 7137aa186d29..f10d4c620312 100644 --- a/arch/arm64/include/asm/haoc/iee.h +++ b/arch/arm64/include/asm/haoc/iee.h @@ -69,6 +69,7 @@ extern bool haoc_enabled; void iee_init_mappings(pgd_t *pgdp); void iee_init_post(void); void iee_stack_init(void); +void iee_init_tcr_ptp(void); void iee_init_tcr(void); void iee_setup_asid(void); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 48337fa70219..391b0da496da 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -5,6 +5,10 @@ #ifndef __ASM_PGTABLE_H #define __ASM_PGTABLE_H +#ifdef CONFIG_PTP +#include +#else + #include #include @@ -1472,4 +1476,6 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #endif /* !__ASSEMBLY__ */ +#endif /* !CONFIG_PTP */ + #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 2c29239d05c3..ba713e53f419 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -13,7 +13,14 @@ static inline void __tlb_remove_table(void *_table) { + #ifdef CONFIG_PTP + if (haoc_enabled) + ptp_pg_free(&pg_cache, page_to_virt((struct page *)_table)); + else + free_page_and_swap_cache((struct page *)_table); + #else free_page_and_swap_cache((struct page *)_table); + #endif } #define tlb_flush tlb_flush diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d93e9a3fceaf..be4262e476a4 100644 --- 
a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -93,6 +93,9 @@ #include #include #include +#ifdef CONFIG_PTP +#include +#endif /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; @@ -1852,7 +1855,14 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); if (!cpu) { + #ifdef CONFIG_PTP + if (haoc_enabled) + alloc = __va(early_iee_pgtable_alloc(0)); + else + alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + #else alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + #endif kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); diff --git a/arch/arm64/kernel/haoc/Kconfig b/arch/arm64/kernel/haoc/Kconfig index 7aa900608650..e1b2e7f79fa3 100644 --- a/arch/arm64/kernel/haoc/Kconfig +++ b/arch/arm64/kernel/haoc/Kconfig @@ -40,4 +40,18 @@ config CREDP by IEE either. If unsure, say N. +config PTP + bool "Page Table Protection (PTP)" + help + Protects page tables by IEE, requiring each page table modification to call + the IEE Gate for secure mapping. 
+ depends on IEE + +config PTP_RESERVE_ORDER + depends on PTP + int "maximum allowable 2^PTP_RESERVE_ORDER pages for one level page table" + range 9 15 + default 12 + + endmenu # HAOC diff --git a/arch/arm64/kernel/haoc/Makefile b/arch/arm64/kernel/haoc/Makefile index 62b467471edc..18a8a831d2fb 100644 --- a/arch/arm64/kernel/haoc/Makefile +++ b/arch/arm64/kernel/haoc/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += haoc.o +obj-y += haoc.o haoc-bitmap.o obj-y += iee/ -obj-$(CONFIG_CREDP) += credp/ \ No newline at end of file +obj-$(CONFIG_CREDP) += credp/ +obj-$(CONFIG_PTP) += ptp/ \ No newline at end of file diff --git a/arch/arm64/kernel/haoc/haoc-bitmap.c b/arch/arm64/kernel/haoc/haoc-bitmap.c new file mode 100644 index 000000000000..855b6036e37d --- /dev/null +++ b/arch/arm64/kernel/haoc/haoc-bitmap.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +bool haoc_bitmap_ready; + +static void * __init haoc_bitmap_alloc_block_zero(unsigned long size) +{ + void *p; + #ifdef CONFIG_PTP + if (haoc_enabled) + p = __va(early_iee_pgtable_alloc(0)); + else + p = memblock_alloc(size, size); + #else + p = memblock_alloc(size, size); + #endif + if (!p) + return NULL; + return p; +} + +static pte_t * __init haoc_bitmap_pte_populate(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + if (pte_none(ptep_get(pte))) { + pte_t entry; + void *p = __va(early_iee_data_alloc(0)); + if (!p) + return NULL; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + #ifdef CONFIG_PTP + if (haoc_enabled){ + write_sysreg(read_sysreg(TCR_EL1) | TCR_HPD1 | TCR_A1, tcr_el1); + isb(); + WRITE_ONCE(*__ptr_to_iee(pte), entry); + write_sysreg(read_sysreg(TCR_EL1) & ~(TCR_HPD1 | TCR_A1), tcr_el1); + isb(); + } + else + set_pte_at(&init_mm, addr, pte, entry); + #else + set_pte_at(&init_mm, addr, pte, entry); + #endif + } + return pte; +} + +static pmd_t * __init 
haoc_bitmap_pmd_populate(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd = pmd_offset(pud, addr); + void *p; + + if (pmd_none(*pmd)) { + p = haoc_bitmap_alloc_block_zero(PAGE_SIZE); + if (!p) + return NULL; + pmd_populate_kernel(&init_mm, pmd, p); + } + return pmd; +} + +static pud_t * __init haoc_bitmap_pud_populate(p4d_t *p4d, unsigned long addr) +{ + pud_t *pud = pud_offset(p4d, addr); + void *p; + + if (pud_none(*pud)) { + p = haoc_bitmap_alloc_block_zero(PAGE_SIZE); + if (!p) + return NULL; + pmd_init(p); + pud_populate(&init_mm, pud, p); + } + return pud; +} + +static p4d_t * __init haoc_bitmap_p4d_populate(pgd_t *pgd, unsigned long addr) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + void *p; + + if (p4d_none(*p4d)) { + p = haoc_bitmap_alloc_block_zero(PAGE_SIZE); + if (!p) + return NULL; + pud_init(p); + p4d_populate(&init_mm, p4d, p); + } + return p4d; +} + +static pgd_t * __init haoc_bitmap_pgd_populate(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + void *p; + + if (pgd_none(*pgd)) { + p = haoc_bitmap_alloc_block_zero(PAGE_SIZE); + if (!p) + return NULL; + pgd_populate(&init_mm, pgd, p); + } + return pgd; +} + +/* Create mappings if that address is not mapped. */ +static pte_t * __init haoc_bitmap_populate_address(unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = haoc_bitmap_pgd_populate(addr); + if (!pgd) + return NULL; + p4d = haoc_bitmap_p4d_populate(pgd, addr); + if (!p4d) + return NULL; + pud = haoc_bitmap_pud_populate(p4d, addr); + if (!pud) + return NULL; + pmd = haoc_bitmap_pmd_populate(pud, addr); + if (!pmd) + return NULL; + pte = haoc_bitmap_pte_populate(pmd, addr); + if (!pte) + return NULL; + + return pte; +} + +/* Map haoc bitmap array after vmemmap region. */ +int __init haoc_bitmap_sparse_init(void) +{ + unsigned long start_pfn, end_pfn; + int i, nid; + /* Iterate through available memory blocks. 
*/ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + u64 bitmap_start = ALIGN_DOWN(((u64)__pfn_to_haoc_bitmap(start_pfn)), + PAGE_SIZE); + u64 bitmap_end = ALIGN(((u64)__pfn_to_haoc_bitmap(end_pfn)), + PAGE_SIZE); + u64 addr = bitmap_start; + pte_t *pte; + + for (; addr < bitmap_end; addr += PAGE_SIZE) { + pte = haoc_bitmap_populate_address(addr); + if (!pte) + pr_err("HAOC: failed on bitmap init."); + } + } + haoc_bitmap_ready = true; + return 0; +} + +void __init haoc_bitmap_setup(void) +{ + /* Setup bitmap types of global data. */ + #ifdef CONFIG_PTP + { + unsigned long start, end, num_pages; + + start = (unsigned long)idmap_pg_dir; + end = (unsigned long)__iee_ptp_data_end; + num_pages = (end - start) / PAGE_SIZE; + if (haoc_enabled) + iee_set_bitmap_type(start, num_pages, IEE_PGTABLE); + } + #endif + + /* Mark bitmaps of early allocated iee data. */ + setup_iee_early_data_bitmap(); +} + +#pragma GCC push_options +#pragma GCC optimize("O0") +static void __iee_code _iee_bitmap_memset(void *ptr, int data, size_t n) +{ + char *_ptr; + + _ptr = (char *)ptr; + + while (n--) + *_ptr++ = data; +} +#pragma GCC pop_options + +void __iee_code _iee_set_bitmap_type(unsigned long __unused, + u64 va, enum HAOC_BITMAP_TYPE type, int num_pages) +{ + // _iee_bitmap_memset(__va_to_haoc_bitmap(va), type, num_pages); +} diff --git a/arch/arm64/kernel/haoc/haoc.c b/arch/arm64/kernel/haoc/haoc.c index d58740aafc3b..64c518a5b4b9 100644 --- a/arch/arm64/kernel/haoc/haoc.c +++ b/arch/arm64/kernel/haoc/haoc.c @@ -8,6 +8,7 @@ */ #include +#include typedef void (*iee_func)(void); @@ -28,6 +29,7 @@ iee_func iee_funcs[] = { (iee_func)_iee_invalidate_token, (iee_func)_iee_validate_token, #endif + (iee_func)_iee_set_bitmap_type, #ifdef CONFIG_CREDP (iee_func)_iee_copy_cred, (iee_func)_iee_set_cred_uid, @@ -58,6 +60,21 @@ iee_func iee_funcs[] = { (iee_func)_iee_set_cred_security, (iee_func)_iee_set_cred_rcu, (iee_func)_iee_set_cred_ucounts, +#endif +#ifdef CONFIG_PTP + 
(iee_func)_iee_set_static_pgd, + (iee_func)_iee_set_bm_pte, + (iee_func)_iee_set_pte, + (iee_func)_iee_set_pmd, + (iee_func)_iee_set_pud, + (iee_func)_iee_set_p4d, + (iee_func)_iee_set_swapper_pgd, + (iee_func)_iee_set_xchg_relaxed, + (iee_func)_iee_set_pmd_xchg_relaxed, + (iee_func)_iee_set_cmpxchg_relaxed, + (iee_func)_iee_set_pmd_cmpxchg_relaxed, + (iee_func)_iee_set_sensitive_pte, + (iee_func)_iee_unset_sensitive_pte, #endif NULL }; diff --git a/arch/arm64/kernel/haoc/iee/iee-func.c b/arch/arm64/kernel/haoc/iee/iee-func.c index bc77be80f181..8862d10525da 100644 --- a/arch/arm64/kernel/haoc/iee/iee-func.c +++ b/arch/arm64/kernel/haoc/iee/iee-func.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -216,6 +217,41 @@ void set_iee_address_invalid(unsigned long lm_addr, unsigned int order) static void iee_set_sensitive_pte(pte_t *lm_ptep, pte_t *iee_ptep, int order, int use_block_pmd) { +#ifdef CONFIG_PTP + if (haoc_enabled) + iee_rw_gate(IEE_OP_SET_SENSITIVE_PTE, lm_ptep, iee_ptep, order, use_block_pmd); + else{ + int i; + + if (use_block_pmd) { + pmd_t pmd = __pmd(pte_val(READ_ONCE(*lm_ptep))); + + pmd = __pmd((pmd_val(pmd) | PMD_SECT_RDONLY) & ~PTE_DBM); + WRITE_ONCE(*lm_ptep, __pte(pmd_val(pmd))); + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*iee_ptep); + + pte = __pte(pte_val(pte) | PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + iee_ptep++; + } + } else { + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*lm_ptep); + + pte = __pte((pte_val(pte) | PTE_RDONLY) & ~PTE_DBM); + WRITE_ONCE(*lm_ptep, pte); + pte = READ_ONCE(*iee_ptep); + pte = __pte(pte_val(pte) | PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + lm_ptep++; + iee_ptep++; + } + } + dsb(ishst); + isb(); + } +#else int i; if (use_block_pmd) { @@ -245,10 +281,46 @@ static void iee_set_sensitive_pte(pte_t *lm_ptep, pte_t *iee_ptep, int order, } dsb(ishst); isb(); +#endif } static void iee_unset_sensitive_pte(pte_t *lm_ptep, pte_t *iee_ptep, int order, int 
use_block_pmd) { +#ifdef CONFIG_PTP + if (haoc_enabled) + iee_rw_gate(IEE_OP_UNSET_SENSITIVE_PTE, lm_ptep, iee_ptep, order, use_block_pmd); + else { + int i; + + if (use_block_pmd) { + pmd_t pmd = __pmd(pte_val(READ_ONCE(*lm_ptep))); + + pmd = __pmd(pmd_val(pmd) | PTE_DBM); + WRITE_ONCE(*lm_ptep, __pte(pmd_val(pmd))); + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*iee_ptep); + + pte = __pte(pte_val(pte) & ~PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + iee_ptep++; + } + } else { + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*lm_ptep); + + pte = __pte(pte_val(pte) | PTE_DBM); + WRITE_ONCE(*lm_ptep, pte); + pte = READ_ONCE(*iee_ptep); + pte = __pte(pte_val(pte) & ~PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + lm_ptep++; + iee_ptep++; + } + } + dsb(ishst); + isb(); + } +#else int i; if (use_block_pmd) { @@ -278,6 +350,7 @@ static void iee_unset_sensitive_pte(pte_t *lm_ptep, pte_t *iee_ptep, int order, } dsb(ishst); isb(); +#endif } /* Only support address range smaller then one PMD block. */ @@ -362,7 +435,7 @@ void put_pages_into_iee(unsigned long addr, int order) /* The reverse operation of put_pages_into_iee(). * Call this function when you are returning pages back to kernel. 
*/ -static void remove_pages_from_iee(unsigned long addr, int order) +void remove_pages_from_iee(unsigned long addr, int order) { pgd_t *pgdir = swapper_pg_dir; pgd_t *pgdp = pgd_offset_pgd(pgdir, addr); diff --git a/arch/arm64/kernel/haoc/iee/iee-init.c b/arch/arm64/kernel/haoc/iee/iee-init.c index 8c0df933f0dc..980218df7945 100644 --- a/arch/arm64/kernel/haoc/iee/iee-init.c +++ b/arch/arm64/kernel/haoc/iee/iee-init.c @@ -15,6 +15,7 @@ #ifdef CONFIG_IEE_PTRP #include #endif +#include #include __aligned(PAGE_SIZE) DEFINE_PER_CPU(u64*[(PAGE_SIZE/8)], @@ -61,7 +62,7 @@ void iee_setup_asid(void) static void iee_setup_init_data(void){ for (u64 addr = (u64)iee_init_data_begin; addr < (u64)iee_init_data_end; addr += PAGE_SIZE) - iee_set_logical_mem(addr, 0, true); + iee_set_logical_mem(addr, 0, false); } void __init iee_init_post(void) diff --git a/arch/arm64/kernel/haoc/iee/iee-mmu.c b/arch/arm64/kernel/haoc/iee/iee-mmu.c index 3dc9e60095ec..e0265ad44c00 100644 --- a/arch/arm64/kernel/haoc/iee/iee-mmu.c +++ b/arch/arm64/kernel/haoc/iee/iee-mmu.c @@ -16,6 +16,9 @@ #include #include #include +#ifdef CONFIG_PTP +#include +#endif #define IEE_EARLY_BLOCK_NR 64 @@ -43,7 +46,15 @@ static struct iee_early_alloc iee_stack = { .curr_block_nr = -1 }; -static DEFINE_MUTEX(fixmap_lock); +#ifdef CONFIG_PTP +static struct iee_early_alloc iee_pgtable = { + .name = "iee_early_pgtable", + .curr_block_nr = -1 +}; +#endif + +DEFINE_SPINLOCK(swapper_pgdir_lock); +DEFINE_MUTEX(fixmap_lock); __aligned(PAGE_SIZE) DECLARE_PER_CPU(u64*[(PAGE_SIZE/8)], iee_cpu_stack_ptr); @@ -54,6 +65,17 @@ __aligned(IEE_STACK_SIZE) __initdata u64 iee_init_stack[IEE_STACK_SIZE/8]; /* Setup global values used in verifications of TCR_EL1 to protect IEE switch gate. * Use fixmap functions as these globals are put inside IEE text section. 
*/ +#ifdef CONFIG_PTP +void __init iee_init_tcr_ptp(void) +{ + unsigned long ptr = pte_set_fixmap_pre_init(__pa_symbol(&kernel_tcr)); + *((u64 *)ptr) = read_sysreg(tcr_el1) & IEE_TCR_MASK & ~(TCR_HPD1 | TCR_A1); + pte_clear_fixmap_pre_init(); + ptr = pte_set_fixmap_pre_init(__pa_symbol(&iee_tcr)); + *((u64 *)ptr) = kernel_tcr | TCR_HPD1 | TCR_A1; + pte_clear_fixmap_pre_init(); +} +#else void __init iee_init_tcr(void) { unsigned long ptr = (unsigned long)(fix_to_virt(FIX_PTE)); @@ -68,6 +90,7 @@ void __init iee_init_tcr(void) *((u64 *)ptr) = kernel_tcr | TCR_HPD1 | TCR_A1; clear_fixmap(FIX_PTE); } +#endif static void __init iee_setup_bootcpu_stack(void) { @@ -105,7 +128,14 @@ static phys_addr_t __init iee_mem_pool_early_alloc(struct iee_early_alloc *cache * any level of table. */ for (i = 0; i < (1 << (order)); i++) { + #ifdef CONFIG_PTP + if (haoc_enabled) + ptr = pte_set_fixmap_pre_init(phys + i * PAGE_SIZE); + else ptr = pte_set_fixmap(phys + i * PAGE_SIZE); + #else + ptr = pte_set_fixmap(phys + i * PAGE_SIZE); + #endif memset(ptr, 0, PAGE_SIZE); @@ -113,7 +143,14 @@ static phys_addr_t __init iee_mem_pool_early_alloc(struct iee_early_alloc *cache * Implicit barriers also ensure the zeroed page is visible to the page * table walker */ + #ifdef CONFIG_PTP + if (haoc_enabled) + pte_clear_fixmap_pre_init(); + else + pte_clear_fixmap(); + #else pte_clear_fixmap(); + #endif } cache->begin = phys; @@ -157,6 +194,9 @@ void __init early_iee_data_cache_init(void) iee_mem_pool_early_alloc(&iee_stack, IEE_DATA_ORDER); /* Calculate IEE data alloc block size. 
*/ iee_mem_pool_early_alloc(&iee_data, get_iee_alloc_order(1)); + #ifdef CONFIG_PTP + iee_mem_pool_early_alloc(&iee_pgtable, get_iee_alloc_order(0)); + #endif } phys_addr_t __init iee_early_alloc(struct iee_early_alloc *cache, @@ -201,8 +241,73 @@ phys_addr_t __init early_iee_data_alloc(int shift) return iee_early_alloc(&iee_data, 0); } -static phys_addr_t __init early_pgtable_alloc(int shift) +#ifdef CONFIG_PTP +phys_addr_t __init early_iee_pgtable_alloc(int shift) +{ + return iee_early_alloc(&iee_pgtable, 0); +} +#endif + +void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) +{ + pgd_t *fixmap_pgdp; + + spin_lock(&swapper_pgdir_lock); + fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); + WRITE_ONCE(*fixmap_pgdp, pgd); + /* + * We need dsb(ishst) here to ensure the page-table-walker sees + * our new entry before set_p?d() returns. The fixmap's + * flush_tlb_kernel_range() via clear_fixmap() does this for us. + */ + pgd_clear_fixmap(); + spin_unlock(&swapper_pgdir_lock); +} + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + if (!pfn_is_map_memory(pfn)) + return pgprot_noncached(vma_prot); + else if (file->f_flags & O_SYNC) + return pgprot_writecombine(vma_prot); + return vma_prot; +} +EXPORT_SYMBOL(phys_mem_access_prot); + +phys_addr_t __init early_pgtable_alloc(int shift) { + #ifdef CONFIG_PTP + if (haoc_enabled) + return early_iee_pgtable_alloc(shift); + else + { + phys_addr_t phys; + void *ptr; + + phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_NOLEAKTRACE); + if (!phys) + panic("Failed to allocate page table page\n"); + + /* + * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE + * slot will be free, so we can (ab)use the FIX_PTE slot to initialise + * any level of table. 
+ */ + ptr = pte_set_fixmap(phys); + + memset(ptr, 0, PAGE_SIZE); + + /* + * Implicit barriers also ensure the zeroed page is visible to the page + * table walker + */ + pte_clear_fixmap(); + + return phys; + } + #else phys_addr_t phys; void *ptr; @@ -227,6 +332,45 @@ static phys_addr_t __init early_pgtable_alloc(int shift) pte_clear_fixmap(); return phys; + #endif +} + +bool pgattr_change_is_safe(u64 old, u64 new) +{ + /* + * The following mapping attributes may be updated in live + * kernel mappings without the need for break-before-make. + */ + pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; + + /* creating or taking down mappings is always safe */ + if (!pte_valid(__pte(old)) || !pte_valid(__pte(new))) + return true; + + /* A live entry's pfn should not change */ + if (pte_pfn(__pte(old)) != pte_pfn(__pte(new))) + return false; + + /* live contiguous mappings may not be manipulated at all */ + if ((old | new) & PTE_CONT) + return false; + + /* Transitioning from Non-Global to Global is unsafe */ + if (old & ~new & PTE_NG) + return false; + + /* + * Changing the memory type between Normal and Normal-Tagged is safe + * since Tagged is considered a permission attribute from the + * mismatched attribute aliases perspective. 
+ */ + if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || + (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) && + ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || + (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED))) + mask |= PTE_ATTRINDX_MASK; + + return ((old ^ new) & ~mask) == 0; } static void iee_init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, @@ -410,6 +554,92 @@ static void iee_alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long en pud_clear_fixmap(); } +static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long addr, end, next; + pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); + + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + + phys &= PAGE_MASK; + addr = virt & PAGE_MASK; + end = PAGE_ALIGN(virt + size); + + do { + next = pgd_addr_end(addr, end); + iee_alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + phys += next - addr; + } while (pgdp++, addr = next, addr != end); +} + +void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + mutex_lock(&fixmap_lock); + __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, + pgtable_alloc, flags); + mutex_unlock(&fixmap_lock); +} + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +extern __alias(__create_pgd_mapping_locked) +void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags); +#endif + +phys_addr_t __pgd_pgtable_alloc(int shift) +{ + void *ptr; + #ifdef CONFIG_PTP + if (haoc_enabled) + ptr = ptp_pg_alloc(&pg_cache, GFP_PGTABLE_KERNEL); + else + 
ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); + #else + ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); + #endif + IEE_CHECK(!ptr); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); + return __pa(ptr); +} + +phys_addr_t pgd_pgtable_alloc(int shift) +{ + phys_addr_t pa = __pgd_pgtable_alloc(shift); + struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); + + /* + * Call proper page table ctor in case later we need to + * call core mm functions like apply_to_page_range() on + * this pre-allocated page table. + * + * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is + * folded, and if so pagetable_pte_ctor() becomes nop. + */ + if (shift == PAGE_SHIFT) + IEE_CHECK(!pagetable_pte_ctor(ptdesc)); + else if (shift == PMD_SHIFT) + IEE_CHECK(!pagetable_pmd_ctor(ptdesc)); + + return pa; +} + /* This function is almost the same with __create_pgd_mapping_locked() * but not permitting block descriptors larger than pmd block to simplify * page table opeartions like splitting blocks. @@ -473,15 +703,34 @@ static void __init __create_pgd_mapping_for_iee_locked(pgd_t *pgdir, phys_addr_t phys += next - addr; continue; } + #ifdef CONFIG_PTP + if (haoc_enabled) + iee_alloc_init_pud_pre_init(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + else + iee_alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + #else iee_alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, flags); + #endif /* Set APTable RO on pgd entries of IEE mappings to prevent kernel access * when TCR.HPD1 == 0. 
*/ p4d = READ_ONCE(*p4dp); + #ifdef CONFIG_PTP + if (haoc_enabled) + iee_set_pgtable_pre_init((unsigned long *)p4dp, + (unsigned long)(__phys_to_p4d_val(__p4d_to_phys(p4d)) + | (PGD_APTABLE_RO | PGD_PXNTABLE | PGD_UXNTABLE | PUD_TYPE_TABLE))); + else + __p4d_populate(p4dp, __p4d_to_phys(p4d), (PGD_APTABLE_RO | PGD_PXNTABLE | + PGD_UXNTABLE | PUD_TYPE_TABLE)); + #else __p4d_populate(p4dp, __p4d_to_phys(p4d), (PGD_APTABLE_RO | PGD_PXNTABLE | PGD_UXNTABLE | PUD_TYPE_TABLE)); + #endif phys += next - addr; } while (pgdp++, addr = next, addr != end); @@ -545,11 +794,53 @@ void __init iee_init_mappings(pgd_t *pgdp) __map_memblock_for_iee(pgdp, start, end, SET_NG(SET_INVALID(PAGE_KERNEL)), flags); } - + #ifdef CONFIG_PTP + iee_init_tcr_ptp(); + #else iee_init_tcr(); + #endif iee_setup_bootcpu_stack(); + #ifdef CONFIG_PTP + pr_info("HAOC: CONFIG_PTP enabled."); + #endif +} + +static void setup_iee_data_cache_bitmap(struct iee_early_alloc *cache, + enum HAOC_BITMAP_TYPE type) +{ + int block_nr = cache->curr_block_nr + 1; + + for (int j = 0; j < block_nr; j++) { + iee_set_bitmap_type((unsigned long)__va(cache->blocks[j].start), + 1 << cache->blocks[j].order, type); + } + #ifdef DEBUG + pr_info("IEE: Mark bitmap of %s block nr %d", cache->name, block_nr); + #endif +} + +void __init setup_iee_early_data_bitmap(void) +{ + setup_iee_data_cache_bitmap(&iee_data, IEE_DATA); + setup_iee_data_cache_bitmap(&iee_stack, IEE_DATA); + #ifdef CONFIG_PTP + if (haoc_enabled) + setup_iee_data_cache_bitmap(&iee_pgtable, IEE_PGTABLE); + #endif } +#ifdef CONFIG_PTP +static void setup_iee_early_address(struct iee_early_alloc *cache) +{ + for (int j = 0; j < cache->curr_block_nr+1; j++) { + for (int i = 0; i < (1 << cache->blocks[j].order); i++) { + set_iee_address_pre_init(__phys_to_iee(cache->blocks[j].start + + i * PAGE_SIZE), true); + } + } +} +#endif + static void prot_iee_early_data_cache(struct iee_early_alloc *cache) { int block_nr = cache->curr_block_nr + 1; @@ -567,8 +858,108 @@ void 
__init init_early_iee_data(void) if (!haoc_enabled) return; + #ifdef CONFIG_PTP + if (haoc_enabled){ + /* Setup iee mappings of early allocated IEE objects to enable IEE. */ + for (i = 0; ((unsigned long)idmap_pg_dir + i * PAGE_SIZE) < + (unsigned long)iee_init_data_end; i++) { + set_iee_address_pre_init(__phys_to_iee(__pa_symbol((unsigned long)idmap_pg_dir + + i * PAGE_SIZE)), true); + } + setup_iee_early_address(&iee_pgtable); + setup_iee_early_address(&iee_data); + + prot_iee_early_data_cache(&iee_pgtable); + } + #endif + for (i = 0; (iee_init_data_begin + i * PAGE_SIZE) < iee_init_data_end; i++) set_iee_address(__phys_to_iee(__pa_symbol(iee_init_data_begin + i * PAGE_SIZE)), 0, true); prot_iee_early_data_cache(&iee_stack); } + +#ifdef CONFIG_PTP +void __init fixmap_copy_ptp(pgd_t *pgdir) +{ + if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdir, FIXADDR_TOT_START)))) { + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't + * live in the carveout for the swapper_pg_dir. We can simply + * re-use the existing dir for the fixmap. + */ + iee_set_pgd_pre_init(pgd_offset_pgd(pgdir, FIXADDR_TOT_START), + READ_ONCE(*pgd_offset_k(FIXADDR_TOT_START))); + } else if (CONFIG_PGTABLE_LEVELS > 3) { + pgd_t *bm_pgdp; + p4d_t *bm_p4dp; + pud_t *bm_pudp; + pudval_t pudval; + /* + * The fixmap shares its top level pgd entry with the kernel + * mapping. This can really only occur when we are running + * with 16k/4 levels, so we can simply reuse the pud level + * entry instead. 
+ */ + IEE_CHECK(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + bm_pgdp = pgd_offset_pgd(pgdir, FIXADDR_TOT_START); + bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_TOT_START); + bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_TOT_START); + pudval = PUD_TYPE_TABLE | PUD_TABLE_AF; + pudval |= PUD_TABLE_UXN; + iee_set_pgtable_pre_init((unsigned long *)bm_pudp, + (unsigned long)(__phys_to_pud_val(__pa_symbol(bm_pmd)) | pudval)); + pud_clear_fixmap_pre_init(); + } else { + BUG(); + } +} + +int __pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + pmdval_t old_pmdval, pmdval; + pmd_t pmd = READ_ONCE(*pmdp); + + if (pmd_same(pmd, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pmd_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. The PTE_RDONLY bit must + * be set to the most permissive (lowest value) of *ptep and entry + * (calculated as: a & b == ~(~a | ~b)). + */ + pmd_val(entry) ^= PTE_RDONLY; + pmdval = pmd_val(pmd); + do { + old_pmdval = pmdval; + pmdval ^= PTE_RDONLY; + pmdval |= pmd_val(entry); + pmdval ^= PTE_RDONLY; + pmdval = iee_set_pmd_cmpxchg_relaxed(pmdp, old_pmdval, pmdval); + } while (pmdval != old_pmdval); + + /* Invalidate a stale read-only entry */ + if (dirty) + flush_tlb_page(vma, address); + return 1; +} + +void * __ref __ptp_vmemmap_alloc_block(unsigned long size, int node) +{ + int order = get_order(size); + + /* If the main allocator is up use that, fallback to bootmem. 
*/ + if (slab_is_available()) + return ptp_pg_alloc(&pg_cache, GFP_KERNEL | __GFP_ZERO); + + if (order != 0) + panic("PTP: Unsupport vmemmap alloc."); + return __va(early_iee_pgtable_alloc(0)); +} +#endif diff --git a/arch/arm64/kernel/haoc/iee/iee-token.c b/arch/arm64/kernel/haoc/iee/iee-token.c index a17a42aaf5c1..b99984423b75 100644 --- a/arch/arm64/kernel/haoc/iee/iee-token.c +++ b/arch/arm64/kernel/haoc/iee/iee-token.c @@ -36,11 +36,15 @@ void __init iee_prepare_init_task_token(void) | __phys_to_pte_val(init_token_page)); /* Manaully go through IEE gates to bypass PTP checks. */ #ifdef CONFIG_PTP - write_sysreg(read_sysreg(TCR_EL1) | TCR_HPD1 | TCR_A1, tcr_el1); - isb(); - WRITE_ONCE(*__ptr_to_iee(ptep), pte); - write_sysreg(read_sysreg(TCR_EL1) & ~(TCR_HPD1 | TCR_A1), tcr_el1); - isb(); + if (haoc_enabled){ + write_sysreg(read_sysreg(TCR_EL1) | TCR_HPD1 | TCR_A1, tcr_el1); + isb(); + WRITE_ONCE(*__ptr_to_iee(ptep), pte); + write_sysreg(read_sysreg(TCR_EL1) & ~(TCR_HPD1 | TCR_A1), tcr_el1); + isb(); + } + else + set_pte(ptep, pte); #else set_pte(ptep, pte); #endif diff --git a/arch/arm64/kernel/haoc/ptp/Makefile b/arch/arm64/kernel/haoc/ptp/Makefile new file mode 100644 index 000000000000..c2887cf5c18c --- /dev/null +++ b/arch/arm64/kernel/haoc/ptp/Makefile @@ -0,0 +1 @@ +obj-y += ptp.o iee-ptp-init.o diff --git a/arch/arm64/kernel/haoc/ptp/iee-ptp-init.c b/arch/arm64/kernel/haoc/ptp/iee-ptp-init.c new file mode 100644 index 000000000000..e0dcbb59dddd --- /dev/null +++ b/arch/arm64/kernel/haoc/ptp/iee-ptp-init.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define PTP_CHECK(condition) do { \ + if (unlikely(condition)) \ + panic("PTP check failed on %s.", __func__); \ +} while (0) + +#if CONFIG_PGTABLE_LEVELS > 3 +void __iee_p4d_populate_pre_init(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) +{ + iee_set_p4d_pre_init(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); +} 
+#else +void __iee_p4d_populate_pre_init(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) +{ + BUILD_BUG(); +} +#endif + +void __iee_pud_populate_pre_init(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) +{ + iee_set_pud_pre_init(pudp, __pud(__phys_to_pud_val(pmdp) | prot)); +} + +void __iee_pmd_populate_pre_init(pmd_t *pmdp, phys_addr_t ptep, + pmdval_t prot) +{ + iee_set_pmd_pre_init(pmdp, __pmd(__phys_to_pmd_val(ptep) | prot)); +} + +int iee_pud_set_huge_pre_init(pud_t *pudp, phys_addr_t phys, pgprot_t prot) +{ + pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); + + /* Only allow permission changes for now */ + if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)), + pud_val(new_pud))) + return 0; + + WARN_ON_ONCE(phys & ~PUD_MASK); + iee_set_pud_pre_init(pudp, new_pud); + return 1; +} + +int iee_pmd_set_huge_pre_init(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) +{ + pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); + + /* Only allow permission changes for now */ + if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)), + pmd_val(new_pmd))) + return 0; + + WARN_ON_ONCE(phys & ~PMD_MASK); + iee_set_pmd_pre_init(pmdp, new_pmd); + return 1; +} + +static inline pte_t *fixmap_pte(unsigned long addr) +{ + return &bm_pte[BM_PTE_TABLE_IDX(addr)][pte_index(addr)]; +} + +void __iee_set_fixmap_pre_init(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + PTP_CHECK(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = fixmap_pte(addr); + + if (pgprot_val(flags)) { + iee_set_pte_pre_init(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + } else { + iee_set_pte_pre_init(ptep, __pte(0)); + flush_tlb_kernel_range(addr, addr+PAGE_SIZE); + } +} + +void iee_set_pgtable_pre_init(unsigned long *addr, unsigned long content) +{ + WRITE_ONCE(*addr, content); +} + +void set_iee_address_pre_init(unsigned long addr, bool valid) +{ + pgd_t *pgdir = swapper_pg_dir; + pgd_t *pgdp = 
pgd_offset_pgd(pgdir, addr); + + p4d_t *p4dp = p4d_offset(pgdp, addr); + + pud_t *pudp = pud_offset(p4dp, addr); + + pmd_t *pmdp = pmd_offset(pudp, addr); + + pte_t *ptep = pte_offset_kernel(pmdp, addr); + pte_t pte = READ_ONCE(*ptep); + + if (valid) + pte = __pte(pte_val(pte) | PTE_VALID); + else + pte = __pte(pte_val(pte) & ~PTE_VALID); + iee_set_pgtable_pre_init((unsigned long *)ptep, (unsigned long)pte.pte); +} + +static void iee_init_pte_pre_init(pmd_t *pmdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) +{ + pte_t *ptep; + + ptep = pte_set_fixmap_offset_pre_init(pmdp, addr); + do { + pte_t old_pte = __ptep_get(ptep); + + iee_set_pgtable_pre_init((unsigned long *)ptep, + (unsigned long)(pfn_pte(__phys_to_pfn(phys), prot).pte)); + + /* + * After the PTE entry has been populated once, we + * only allow updates to the permission attributes. + */ + PTP_CHECK(!pgattr_change_is_safe(pte_val(old_pte), + pte_val(__ptep_get(ptep)))); + + phys += PAGE_SIZE; + } while (ptep++, addr += PAGE_SIZE, addr != end); + + pte_clear_fixmap_pre_init(); +} + +static void iee_alloc_init_cont_pte_pre_init(pmd_t *pmdp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long next; + pmd_t pmd = READ_ONCE(*pmdp); + + PTP_CHECK(pmd_sect(pmd)); + if (pmd_none(pmd)) { + pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; + phys_addr_t pte_phys; + + if (flags & NO_EXEC_MAPPINGS) + pmdval |= PMD_TABLE_PXN; + PTP_CHECK(!pgtable_alloc); + pte_phys = pgtable_alloc(PAGE_SHIFT); + iee_set_pgtable_pre_init((unsigned long *)pmdp, + (unsigned long)(__phys_to_pmd_val(pte_phys) | pmdval)); + pmd = READ_ONCE(*pmdp); + } + PTP_CHECK(pmd_bad(pmd)); + + do { + pgprot_t __prot = prot; + + next = pte_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 
0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + iee_init_pte_pre_init(pmdp, addr, next, phys, __prot); + + phys += next - addr; + } while (addr = next, addr != end); +} + +static void iee_init_pmd_pre_init(pud_t *pudp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_set_fixmap_offset_pre_init(pudp, addr); + do { + pmd_t old_pmd = READ_ONCE(*pmdp); + + next = pmd_addr_end(addr, end); + + /* try section mapping first */ + if (((addr | next | phys) & ~PMD_MASK) == 0 && + (flags & NO_BLOCK_MAPPINGS) == 0) { + iee_set_pgtable_pre_init((unsigned long *)pmdp, + (unsigned long)(pfn_pmd(__phys_to_pfn(phys), + mk_pmd_sect_prot(prot)).pmd)); + + /* + * After the PMD entry has been populated once, we + * only allow updates to the permission attributes. + */ + PTP_CHECK(!pgattr_change_is_safe(pmd_val(old_pmd), + READ_ONCE(pmd_val(*pmdp)))); + } else { + iee_alloc_init_cont_pte_pre_init(pmdp, addr, next, phys, prot, + pgtable_alloc, flags); + + PTP_CHECK(pmd_val(old_pmd) != 0 && + pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); + } + phys += next - addr; + } while (pmdp++, addr = next, addr != end); + + pmd_clear_fixmap_pre_init(); +} + +static void iee_alloc_init_cont_pmd_pre_init(pud_t *pudp, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags) +{ + unsigned long next; + pud_t pud = READ_ONCE(*pudp); + + /* + * Check for initial section mappings in the pgd/pud. 
+ */ + PTP_CHECK(pud_sect(pud)); + if (pud_none(pud)) { + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; + phys_addr_t pmd_phys; + + if (flags & NO_EXEC_MAPPINGS) + pudval |= PUD_TABLE_PXN; + PTP_CHECK(!pgtable_alloc); + pmd_phys = pgtable_alloc(PMD_SHIFT); + iee_set_pgtable_pre_init((unsigned long *)pudp, + (unsigned long)(__phys_to_pud_val(pmd_phys) | pudval)); + pud = READ_ONCE(*pudp); + } + PTP_CHECK(pud_bad(pud)); + + do { + pgprot_t __prot = prot; + + next = pmd_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + iee_init_pmd_pre_init(pudp, addr, next, phys, __prot, pgtable_alloc, flags); + + phys += next - addr; + } while (addr = next, addr != end); +} + +void iee_alloc_init_pud_pre_init(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long next; + pud_t *pudp; + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); + + if (p4d_none(p4d)) { + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; + phys_addr_t pud_phys; + + if (flags & NO_EXEC_MAPPINGS) + p4dval |= P4D_TABLE_PXN; + PTP_CHECK(!pgtable_alloc); + pud_phys = pgtable_alloc(PUD_SHIFT); + iee_set_pgtable_pre_init((unsigned long *)p4dp, + (unsigned long)(__phys_to_p4d_val(pud_phys) | p4dval)); + p4d = READ_ONCE(*p4dp); + } + PTP_CHECK(p4d_bad(p4d)); + + pudp = pud_set_fixmap_offset_pre_init(p4dp, addr); + do { + pud_t old_pud = READ_ONCE(*pudp); + + next = pud_addr_end(addr, end); + + iee_alloc_init_cont_pmd_pre_init(pudp, addr, next, phys, prot, + pgtable_alloc, flags); + + PTP_CHECK(pud_val(old_pud) != 0 && + pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); + phys += next - addr; + } while (pudp++, addr = next, addr != end); + + pud_clear_fixmap_pre_init(); +} + +static 
void __create_pgd_mapping_locked_pre_init(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long addr, end, next; + pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); + + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + + phys &= PAGE_MASK; + addr = virt & PAGE_MASK; + end = PAGE_ALIGN(virt + size); + + do { + next = pgd_addr_end(addr, end); + iee_alloc_init_pud_pre_init(pgdp, addr, next, phys, prot, pgtable_alloc, + flags); + phys += next - addr; + } while (pgdp++, addr = next, addr != end); +} + +void __create_pgd_mapping_pre_init(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + mutex_lock(&fixmap_lock); + __create_pgd_mapping_locked_pre_init(pgdir, phys, virt, size, prot, + pgtable_alloc, flags); + mutex_unlock(&fixmap_lock); +} + +static void __init early_iounmap_after_init(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = offset_in_page(virt_addr); + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) 
{ + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +void __init efi_memmap_unmap_after_init(void) +{ + if (!efi_enabled(EFI_MEMMAP)) + return; + + if (!(efi.memmap.flags & EFI_MEMMAP_LATE)) { + unsigned long size; + + size = efi.memmap.desc_size * efi.memmap.nr_map; + early_iounmap_after_init((__force void __iomem *)efi.memmap.map, size); + } else { + memunmap(efi.memmap.map); + } + + efi.memmap.map = NULL; + clear_bit(EFI_MEMMAP, &efi.flags); +} diff --git a/arch/arm64/kernel/haoc/ptp/ptp.c b/arch/arm64/kernel/haoc/ptp/ptp.c new file mode 100644 index 000000000000..973542291a92 --- /dev/null +++ b/arch/arm64/kernel/haoc/ptp/ptp.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +bool check_addr_in_iee_valid(unsigned long addr) +{ + pgd_t *pgdir = swapper_pg_dir; + + pgd_t *pgdp = pgd_offset_pgd(pgdir, addr); + p4d_t *p4dp = p4d_offset(pgdp, addr); + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + // if (haoc_bitmap_ready) + // return iee_verify_not_normal(__iee_to_virt(addr), + // __iee_to_virt(addr+PAGE_SIZE)); + + if (!(p4d_val(READ_ONCE(*p4dp)) & PTE_VALID)) + return false; + + pudp = pud_offset(p4dp, addr); + + if (!(pud_val(READ_ONCE(*pudp)) & PTE_VALID)) + return false; + + pmdp = pmd_offset(pudp, addr); + + if (!(pmd_val(READ_ONCE(*pmdp)) & PTE_VALID)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + + return (pte_val(READ_ONCE(*ptep)) & PTE_VALID); +} + +// Return true if it is only changing prot of a pte. +static inline bool is_changing_pte_prot(pte_t *ptep, pte_t pte) +{ + if(((pte_val(*ptep) ^ pte_val(pte)) & PTE_ADDR_MASK) == 0) + return true; + else + return false; +} + +// Return true if the modify does not break DEP. 
+static inline bool check_pte_dep(char *addr, pte_t pte) +{ + // DEP for kernel code and readonly data + // _text: .text start addr, __init_begin: .rodata end addr + if (addr >= _stext && addr < _etext) { + if ((PTE_WRITE & pte_val(pte)) // DBM == 1 --> writable + || !(PTE_RDONLY & pte_val(pte))) { // DBM == 0 && AP[2] = 0 --> writable + panic("Can't make kernel's text/readonly page as writable!\n" + "addr = 0x%16llx, pte_val = 0x%16llx", + (u64)addr, pte_val(pte)); + } + } + return true; +} + +// Return true if the modify does not break DEP. +static inline bool check_pmd_dep(char *addr, pmd_t pmd) +{ + // DEP for kernel code and readonly data + // _text: .text start addr, __init_begin: .rodata end addr + if (addr >= _stext && addr < _etext) { + if ((PTE_WRITE & pmd_val(pmd)) || // DBM == 1 --> writable + !(PTE_RDONLY & pmd_val(pmd))) { // DBM == 0 && AP[2] = 0 --> writable + panic("Can't make kernel's text/readonly page as writable!\n" + "addr = 0x%16llx, pmd_val = 0x%16llx", + (u64)addr, pmd_val(pmd)); + } + } + return true; +} + +void __iee_code _iee_set_static_pgd(int flag, pgd_t *pgdp, pgd_t pgd) +{ + if ((pgd_val(pgd) & PMD_TABLE_BIT) && + !check_addr_in_iee_valid(__phys_to_iee(__pgd_to_phys(pgd)))) + panic("You can't use non-iee-pgtable\n"); + + WRITE_ONCE(*((pgd_t *)(__phys_to_iee(__pa_symbol(pgdp)))), pgd); +} + +void __iee_code _iee_set_bm_pte(int flag, pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*((pte_t *)(__phys_to_iee(__pa_symbol(ptep)))), pte); +} + +void __iee_code _iee_set_pte(int flag, pte_t *ptep, pte_t pte) +{ + char *addr = (char *)__phys_to_kimg(__pte_to_phys(pte)); + + if (!(pte_val(pte) & PTE_VALID)) { + WRITE_ONCE(*((pte_t *)((unsigned long)ptep | IEE_OFFSET)), pte); + return; + } + + // Avoid mapping a new VA to IEE PA. + if(!is_changing_pte_prot(ptep, pte) && + check_addr_in_iee_valid(__phys_to_iee(__pte_to_phys(pte)))) + panic("You are remmaping IEE page to other VA.\n"); + + // Avoid mapping a writable VA to kernel code PA. 
+ if (!check_pte_dep(addr, pte)) + return; + + WRITE_ONCE(*((pte_t *)((unsigned long)ptep | IEE_OFFSET)), pte); +} + +void __iee_code _iee_set_pmd(int flag, pmd_t *pmdp, pmd_t pmd) +{ + char *addr = (char *)__phys_to_kimg(__pmd_to_phys(pmd)); + + if (!(pmd_val(pmd) & PMD_SECT_VALID)) { + WRITE_ONCE(*((pmd_t *)((unsigned long)pmdp | IEE_OFFSET)), pmd); + return; + } + + // Check if the pte table is legally allocated. + if ((pmd_val(pmd) & PMD_TABLE_BIT) && + !check_addr_in_iee_valid(__phys_to_iee(__pmd_to_phys(pmd)))) + panic("You can't use non-iee-pgtable\n"); + + // Avoid mapping a huge pmd to IEE physical page. + // if(!(pmd_val(pmd) & PMD_TABLE_BIT) && check_addr_range_in_iee_valid(pmd)) + // panic("Mapping IEE physical page to a huge pmd.\n"); + + if (!check_pmd_dep(addr, pmd)) + return; + + WRITE_ONCE(*((pmd_t *)((unsigned long)pmdp | IEE_OFFSET)), pmd); +} + +void __iee_code _iee_set_pud(int flag, pud_t *pudp, pud_t pud) +{ + if (!(pud_val(pud) & PMD_SECT_VALID)) { + WRITE_ONCE(*((pud_t *)((unsigned long)pudp | IEE_OFFSET)), pud); + return; + } + + if ((pud_val(pud) & PMD_TABLE_BIT) && + !check_addr_in_iee_valid(__phys_to_iee(__pud_to_phys(pud)))) + panic("You can't use non-iee-pgtable\n"); + + WRITE_ONCE(*((pud_t *)((unsigned long)pudp | IEE_OFFSET)), pud); +} + +void __iee_code _iee_set_p4d(int flag, p4d_t *p4dp, p4d_t p4d) +{ + if (!(p4d_val(p4d) & PMD_SECT_VALID)) { + WRITE_ONCE(*((p4d_t *)((unsigned long)p4dp | IEE_OFFSET)), p4d); + return; + } + + if ((p4d_val(p4d) & PMD_TABLE_BIT) && + !check_addr_in_iee_valid(__phys_to_iee(__p4d_to_phys(p4d)))) + panic("You can't use non-iee-pgtable\n"); + + WRITE_ONCE(*((p4d_t *)((unsigned long)p4dp | IEE_OFFSET)), p4d); +} + +void __iee_code _iee_set_swapper_pgd(int flag, pgd_t *pgdp, pgd_t pgd) +{ + if (!(pgd_val(pgd) & PMD_SECT_VALID)) { + WRITE_ONCE(*((pgd_t *)(__phys_to_iee(__pa_symbol(pgdp)))), pgd); + return; + } + + if ((pgd_val(pgd) & PMD_TABLE_BIT) && + 
!check_addr_in_iee_valid(__phys_to_iee(__pgd_to_phys(pgd)))) + panic("You can't use non-iee-pgtable\n"); + + WRITE_ONCE(*((pgd_t *)(__phys_to_iee(__pa_symbol(pgdp)))), pgd); +} + +pteval_t __iee_code _iee_set_xchg_relaxed(int flag, pte_t *ptep, pteval_t pteval) +{ + pteval_t ret = xchg_relaxed((pteval_t *)((unsigned long)ptep | IEE_OFFSET), pteval); + return ret; +} + +pmdval_t __iee_code _iee_set_pmd_xchg_relaxed(int flag, pmd_t *pmdp, + pmdval_t pmdval) +{ + pmdval_t ret = xchg_relaxed((pmdval_t *)((unsigned long)pmdp | IEE_OFFSET), + pmdval); + return ret; +} + +pteval_t __iee_code _iee_set_cmpxchg_relaxed(int flag, pte_t *ptep, + pteval_t old_pteval, pteval_t new_pteval) +{ + pteval_t pteval = cmpxchg_relaxed((pteval_t *)((unsigned long)ptep | IEE_OFFSET), + old_pteval, new_pteval); + + return pteval; +} + +pmdval_t __iee_code _iee_set_pmd_cmpxchg_relaxed(int flag, pmd_t *pmdp, + pmdval_t old_pmdval, pmdval_t new_pmdval) +{ + pmdval_t pmdval = cmpxchg_relaxed((pmdval_t *)((unsigned long)pmdp | IEE_OFFSET), + old_pmdval, new_pmdval); + + return pmdval; +} + +void __iee_code _iee_set_sensitive_pte(int flag, pte_t *lm_ptep, pte_t *iee_ptep, + int order, int use_block_pmd) +{ + int i; + + lm_ptep = (pte_t *)((unsigned long)lm_ptep | IEE_OFFSET); + iee_ptep = (pte_t *)((unsigned long)iee_ptep | IEE_OFFSET); + if (use_block_pmd) { + pmd_t pmd = __pmd(pte_val(READ_ONCE(*lm_ptep))); + + pmd = __pmd((pmd_val(pmd) | PMD_SECT_RDONLY) & ~PTE_DBM); + WRITE_ONCE(*lm_ptep, __pte(pmd_val(pmd))); + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*iee_ptep); + + pte = __pte(pte_val(pte) | PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + iee_ptep++; + } + } else { + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*lm_ptep); + + pte = __pte((pte_val(pte) | PTE_RDONLY) & ~PTE_DBM); + WRITE_ONCE(*lm_ptep, pte); + pte = READ_ONCE(*iee_ptep); + pte = __pte(pte_val(pte) | PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + lm_ptep++; + iee_ptep++; + } + } +} + +void 
__iee_code _iee_unset_sensitive_pte(int flag, pte_t *lm_ptep, pte_t *iee_ptep, + int order, int use_block_pmd) +{ + int i; + + lm_ptep = (pte_t *)((unsigned long)lm_ptep | IEE_OFFSET); + iee_ptep = (pte_t *)((unsigned long)iee_ptep | IEE_OFFSET); + if (use_block_pmd) { + pmd_t pmd = __pmd(pte_val(READ_ONCE(*lm_ptep))); + + pmd = __pmd(pmd_val(pmd) | PTE_DBM); + WRITE_ONCE(*lm_ptep, __pte(pmd_val(pmd))); + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*iee_ptep); + + pte = __pte(pte_val(pte) & ~PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + iee_ptep++; + } + } else { + for (i = 0; i < (1 << order); i++) { + pte_t pte = READ_ONCE(*lm_ptep); + + pte = __pte(pte_val(pte) | PTE_DBM); + WRITE_ONCE(*lm_ptep, pte); + pte = READ_ONCE(*iee_ptep); + pte = __pte(pte_val(pte) & ~PTE_VALID); + WRITE_ONCE(*iee_ptep, pte); + lm_ptep++; + iee_ptep++; + } + } +} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8a98897ac9bb..18fd57744da9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -363,6 +363,11 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) unflatten_device_tree(); bootmem_init(); +#ifdef CONFIG_PTP + if (haoc_enabled) + /* Prepare page pool for page tables. */ + ptp_pg_cache_init(&pg_cache, 0, CONFIG_PGTABLE_LEVELS, "pg_cache"); +#endif kasan_init(); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 58ec8230ebf7..e8fd6c75b01d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -155,12 +155,24 @@ jiffies = jiffies_64; . = ALIGN(PAGE_SIZE); #endif +#ifdef CONFIG_PTP +#define PTP_DATA \ + . = ALIGN(PAGE_SIZE); \ + __iee_ptp_data_start = .; \ + *(.iee.ptp) \ + . = ALIGN(PAGE_SIZE); \ + __iee_ptp_data_end = .; +#else +#define PTP_DATA +#endif + #ifdef CONFIG_IEE #define IEE_INIT_DATA \ .iee.data : { \ . = ALIGN(PAGE_SIZE); \ iee_init_data_begin = .; \ CRED_DATA \ + PTP_DATA \ . 
= ALIGN(PAGE_SIZE); \ iee_init_data_end = .; \ } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b26a39db9970..8c58564a57a6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -253,7 +253,14 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, pteval ^= PTE_RDONLY; pteval |= pte_val(entry); pteval ^= PTE_RDONLY; + #ifdef CONFIG_PTP + if (haoc_enabled) + pteval = iee_set_cmpxchg_relaxed(ptep, old_pteval, pteval); + else + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + #else pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + #endif } while (pteval != old_pteval); /* Invalidate a stale read-only entry */ diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index a55b36d04590..c9f70fcd2d7f 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -15,6 +15,9 @@ #include #include #include +#ifdef CONFIG_PTP +#include +#endif #define NR_BM_PTE_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) @@ -28,9 +31,15 @@ static_assert(NR_BM_PMD_TABLES == 1); #define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) +#ifdef CONFIG_PTP +pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __section(".iee.ptp") __aligned(PAGE_SIZE); +pmd_t bm_pmd[PTRS_PER_PMD] __section(".iee.ptp") __aligned(PAGE_SIZE) __maybe_unused; +pud_t bm_pud[PTRS_PER_PUD] __section(".iee.ptp") __aligned(PAGE_SIZE) __maybe_unused; +#else static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss; static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; +#endif static inline pte_t *fixmap_pte(unsigned long addr) { @@ -44,8 +53,12 @@ static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; + #ifdef CONFIG_PTP + __iee_pmd_populate_pre_init(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE); + #else __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE | 
PMD_TABLE_AF); + #endif } } @@ -57,8 +70,12 @@ static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, pmd_t *pmdp; if (pud_none(pud)) + #ifdef CONFIG_PTP + __iee_pud_populate_pre_init(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE); + #else __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE | PUD_TABLE_AF); + #endif pmdp = pmd_offset_kimg(pudp, addr); do { @@ -85,8 +102,12 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, } if (p4d_none(p4d)) + #ifdef CONFIG_PTP + __iee_p4d_populate_pre_init(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE); + #else __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE | P4D_TABLE_AF); + #endif pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); @@ -124,9 +145,23 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { + #ifdef CONFIG_PTP + if (haoc_enabled) + iee_set_bm_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + else + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + #else __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + #endif } else { + #ifdef CONFIG_PTP + if (haoc_enabled) + iee_set_bm_pte(ptep, __pte(0)); + else + __pte_clear(&init_mm, addr, ptep); + #else __pte_clear(&init_mm, addr, ptep); + #endif flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 71a4a7d7f0b7..81148a9c1848 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -44,9 +44,13 @@ #include #include #include +#include #ifdef CONFIG_IEE_SIP #include #endif +#ifdef CONFIG_PTP +#include +#endif #endif #define NO_BLOCK_MAPPINGS BIT(0) @@ -81,6 +85,7 @@ long __section(".mmuoff.data.write") __early_cpu_boot_status; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +#ifndef CONFIG_IEE static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); @@ -460,6 +465,7 @@ static phys_addr_t 
pgd_pgtable_alloc(int shift) return pa; } +#endif /* * This function can only be used to modify existing table entries, @@ -474,8 +480,13 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } + #ifdef CONFIG_PTP + __create_pgd_mapping_pre_init(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); + #else __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, NO_CONT_MAPPINGS); + #endif } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -512,8 +523,17 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, phys_addr_t end, pgprot_t prot, int flags) { + #ifdef CONFIG_PTP + if (haoc_enabled) + __create_pgd_mapping_pre_init(pgdp, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); + else + __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); + #else __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, prot, early_pgtable_alloc, flags); + #endif } void __init mark_linear_text_alias_ro(void) @@ -666,8 +686,17 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(pa_start)); BUG_ON(!PAGE_ALIGNED(size)); + #ifdef CONFIG_PTP + if (haoc_enabled) + __create_pgd_mapping_pre_init(pgdp, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc, flags); + else + __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc, flags); + #else __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc, flags); + #endif if (!(vm_flags & VM_NO_GUARD)) size += PAGE_SIZE; @@ -698,7 +727,14 @@ static int __init map_entry_trampoline(void) pgprot_val(prot) &= ~PTE_NG; /* Map only the text into the trampoline page table */ + #ifdef CONFIG_PTP + if (haoc_enabled) + 
iee_memset(__va(__pa_symbol(tramp_pg_dir)), 0, PGD_SIZE); + else + memset(tramp_pg_dir, 0, PGD_SIZE); + #else memset(tramp_pg_dir, 0, PGD_SIZE); + #endif __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, entry_tramp_text_size(), prot, __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); @@ -799,8 +835,14 @@ static void __init map_kernel(pgd_t *pgdp) #else map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0); #endif - - fixmap_copy(pgdp); +#ifdef CONFIG_PTP + if (haoc_enabled) + fixmap_copy_ptp(pgdp); + else + fixmap_copy(pgdp); +#else + fixmap_copy(pgdp); +#endif kasan_copy_shadow(pgdp); } @@ -836,7 +878,15 @@ static void __init create_idmap(void) void __init paging_init(void) { + #ifdef CONFIG_PTP + pgd_t *pgdp; + if (haoc_enabled) + pgdp = pgd_set_fixmap_pre_init(__pa_symbol(swapper_pg_dir)); + else + pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); + #else pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); + #endif extern pgd_t init_idmap_pg_dir[]; idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0)); @@ -852,7 +902,14 @@ void __init paging_init(void) iee_init_mappings(pgdp); #endif + #ifdef CONFIG_PTP + if (haoc_enabled) + pgd_clear_fixmap_pre_init(); + else + pgd_clear_fixmap(); + #else pgd_clear_fixmap(); + #endif cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir); init_mm.pgd = swapper_pg_dir; diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771..243a1a93d863 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -14,6 +14,9 @@ #include #include #include +#ifdef CONFIG_PTP +#include +#endif static struct kmem_cache *pgd_cache __ro_after_init; @@ -22,7 +25,14 @@ pgd_t *pgd_alloc(struct mm_struct *mm) gfp_t gfp = GFP_PGTABLE_USER; if (PGD_SIZE == PAGE_SIZE) + #ifdef CONFIG_PTP + if (haoc_enabled) + return ptp_pg_alloc(&pg_cache, gfp); + else + return (pgd_t *)__get_free_page(gfp); + #else return (pgd_t *)__get_free_page(gfp); + #endif else return 
kmem_cache_alloc(pgd_cache, gfp); } @@ -30,7 +40,14 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (PGD_SIZE == PAGE_SIZE) + #ifdef CONFIG_PTP + if (haoc_enabled) + ptp_pg_free(&pg_cache, pgd); + else + free_page((unsigned long)pgd); + #else free_page((unsigned long)pgd); + #endif else kmem_cache_free(pgd_cache, pgd); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 5139a28130c0..965022723fd5 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -252,7 +252,14 @@ int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, level_mask = GENMASK_ULL(level_msb, level_lsb); index = (dst_addr & level_mask) >> level_lsb; + #ifdef CONFIG_PTP + if (haoc_enabled) + set_pte((pte_t *)(levels[this_level] + index), __pte(prev_level_entry)); + else + *(levels[this_level] + index) = prev_level_entry; + #else *(levels[this_level] + index) = prev_level_entry; + #endif pfn = virt_to_pfn(levels[this_level]); prev_level_entry = pte_val(pfn_pte(pfn, diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 83092d93f36a..8179272c4fc0 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -94,7 +94,14 @@ static int __init arm_enable_runtime_services(void) return 0; } + #ifdef CONFIG_PTP + if (haoc_enabled) + efi_memmap_unmap_after_init(); + else + efi_memmap_unmap(); + #else efi_memmap_unmap(); + #endif mapsize = efi.memmap.desc_size * efi.memmap.nr_map; diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c index a5fbb6ed38ae..aa79aabb1b11 100644 --- a/drivers/tty/serial/earlycon.c +++ b/drivers/tty/serial/earlycon.c @@ -40,7 +40,14 @@ static void __iomem * __init earlycon_map(resource_size_t paddr, size_t size) { void __iomem *base; #ifdef CONFIG_FIX_EARLYCON_MEM + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + __iee_set_fixmap_pre_init(FIX_EARLYCON_MEM_BASE, 
paddr & PAGE_MASK, FIXMAP_PAGE_IO); + else + set_fixmap_io(FIX_EARLYCON_MEM_BASE, paddr & PAGE_MASK); + #else set_fixmap_io(FIX_EARLYCON_MEM_BASE, paddr & PAGE_MASK); + #endif base = (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); base += paddr & ~PAGE_MASK; #else diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c index 45b42d8f6453..81481915cd00 100644 --- a/drivers/usb/early/ehci-dbgp.c +++ b/drivers/usb/early/ehci-dbgp.c @@ -879,7 +879,14 @@ int __init early_dbgp_init(char *s) * FIXME I don't have the bar size so just guess PAGE_SIZE is more * than enough. 1K is the biggest I have seen. */ + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + __iee_set_fixmap_pre_init(FIX_DBGP_BASE, bar_val & PAGE_MASK, FIXMAP_PAGE_NOCACHE); + else + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + #else set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + #endif ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); ehci_bar += bar_val & ~PAGE_MASK; dbgp_printk("ehci_bar: %p\n", ehci_bar); diff --git a/include/linux/efi.h b/include/linux/efi.h index a60e9111cbb3..9b3868ff6ded 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -26,6 +26,9 @@ #include #include +#if defined(CONFIG_PTP) && defined(CONFIG_ARM64) +#include +#endif struct screen_info; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3eef1ec61af3..4807f6ede1dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -30,6 +30,9 @@ #include #include #include +#ifdef CONFIG_PTP +#include +#endif struct mempolicy; struct anon_vma; @@ -2972,7 +2975,15 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) */ static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct page *page; +#ifdef CONFIG_PTP + if (haoc_enabled) + page = virt_to_page((unsigned long)ptp_pg_alloc(&pg_cache, gfp)); + else + page = alloc_pages(gfp | __GFP_COMP, order); 
+#else + page = alloc_pages(gfp | __GFP_COMP, order); +#endif return page_ptdesc(page); } @@ -2988,7 +2999,14 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + #ifdef CONFIG_PTP + if (haoc_enabled) + ptp_pg_free(&pg_cache, page_address(page)); + else + __free_pages(page, compound_order(page)); + #else __free_pages(page, compound_order(page)); + #endif } #if USE_SPLIT_PTE_PTLOCKS diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 6a7bd1b95ef9..3094ef5e2465 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -437,7 +437,14 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ + #ifdef CONFIG_PTP + if (haoc_enabled) + set_pmd(args->pmdp, __pmd(0)); + else + WRITE_ONCE(*args->pmdp, __pmd(0)); + #else WRITE_ONCE(*args->pmdp, __pmd(0)); + #endif WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); WARN_ON(!pmd_clear_huge(args->pmdp)); pmd = READ_ONCE(*args->pmdp); @@ -457,7 +464,14 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. 
*/ + #ifdef CONFIG_PTP + if (haoc_enabled) + set_pud(args->pudp, __pud(0)); + else + WRITE_ONCE(*args->pudp, __pud(0)); + #else WRITE_ONCE(*args->pudp, __pud(0)); + #endif WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); WARN_ON(!pud_clear_huge(args->pudp)); pud = READ_ONCE(*args->pudp); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 9d4d27399f80..bb12c1a4384a 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -20,7 +20,11 @@ #include "internal.h" #ifdef CONFIG_MMU +#if defined(CONFIG_PTP) && defined(CONFIG_ARM64) +int early_ioremap_debug __initdata; +#else static int early_ioremap_debug __initdata; +#endif static int __init early_ioremap_debug_setup(char *str) { @@ -30,7 +34,11 @@ static int __init early_ioremap_debug_setup(char *str) } early_param("early_ioremap_debug", early_ioremap_debug_setup); +#if defined(CONFIG_PTP) && defined(CONFIG_ARM64) +int after_paging_init __initdata; +#else static int after_paging_init __initdata; +#endif pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, unsigned long size, @@ -64,9 +72,15 @@ static inline void __init __late_clear_fixmap(enum fixed_addresses idx) } #endif +#if defined(CONFIG_PTP) && defined(CONFIG_ARM64) +void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; +#else static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; +#endif void __init early_ioremap_setup(void) { @@ -147,7 +161,14 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) if (after_paging_init) __late_set_fixmap(idx, phys_addr, prot); else + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + __iee_set_fixmap_pre_init(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, 
prot); + #else __early_set_fixmap(idx, phys_addr, prot); + #endif phys_addr += PAGE_SIZE; --idx; --nrpages; @@ -199,7 +220,14 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) if (after_paging_init) __late_clear_fixmap(idx); else + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + __iee_set_fixmap_pre_init(idx, 0, FIXMAP_PAGE_CLEAR); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + #else __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + #endif --idx; --nrpages; } diff --git a/mm/haoc/ptp-pg_cache.c b/mm/haoc/ptp-pg_cache.c index 3e0b10c98151..5ecbfb281d25 100644 --- a/mm/haoc/ptp-pg_cache.c +++ b/mm/haoc/ptp-pg_cache.c @@ -47,7 +47,7 @@ static void __ptp_set_iee_pages(unsigned long start_addr, unsigned long end_addr addr = start_addr; while (addr < end_addr) { - set_iee_page(addr, PMD_ORDER, IEE_PGTABLE); + set_iee_page(addr, PMD_ORDER); addr += PMD_SIZE; } flush_tlb_kernel_range(start_addr, end_addr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 265b79827de8..39a548adeb94 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2547,7 +2547,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + WRITE_ONCE(*(&_pmd), __pmd(__phys_to_pmd_val(page_to_phys(pgtable)) + | (PMD_TYPE_TABLE | PMD_TABLE_PXN))); + else + pmd_populate(mm, &_pmd, pgtable); + #else pmd_populate(mm, &_pmd, pgtable); + #endif pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); @@ -2719,7 +2727,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * This's critical for some architectures (Power). 
*/ pgtable = pgtable_trans_huge_withdraw(mm, pmd); + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled) + WRITE_ONCE(*(&_pmd), __pmd(__phys_to_pmd_val(page_to_phys(pgtable)) + | (PMD_TYPE_TABLE | PMD_TABLE_PXN))); + else + pmd_populate(mm, &_pmd, pgtable); + #else pmd_populate(mm, &_pmd, pgtable); + #endif pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c3353cd442a5..0bd834451504 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -30,6 +30,9 @@ #include #include +#ifdef CONFIG_PTP +extern void *__ptp_vmemmap_alloc_block(unsigned long size, int node); +#endif /* * Allocate a block of memory to be used to back the virtual memory map @@ -150,6 +153,32 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, pte_t entry; void *p; + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled){ + WARN_ONCE(reuse, "PTP: reuse in vmemmap."); + p = __ptp_vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + } else { + if (!reuse) { + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. 
+ */ + get_page(reuse); + p = page_to_virt(reuse); + } + } + #else if (!reuse) { p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); if (!p) @@ -167,6 +196,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, get_page(reuse); p = page_to_virt(reuse); } + #endif entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } @@ -175,11 +205,28 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) { - void *p = vmemmap_alloc_block(size, node); + void *p = NULL; + + #if defined(CONFIG_PTP) && defined(CONFIG_ARM64) + if (haoc_enabled){ + p = __ptp_vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + } else { + p = vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + memset(p, 0, size); + } + #else + p = vmemmap_alloc_block(size, node); if (!p) return NULL; memset(p, 0, size); + #endif return p; } -- Gitee From 74db99b1d50ac886fcb059e5e76880460f227e8a Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:22 +0800 Subject: [PATCH 11/59] irqchip/gic-v3-its: Introduce the reserved device ID pools commit 87f09e449b654411ffa6362174171c291d7e2b41 openEuler Organize the system-wide unused device IDs to build several device ID pools. Use these pools to manage the reserved device IDs which can be used by virtual devices (without the actual HW device ID). 
Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/irqchip/irq-gic-v3-its.c | 122 +++++++++++++++++++++++++++++++ drivers/misc/Kconfig | 9 +++ 2 files changed, 131 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 2969bb07ec93..4cd9738ca467 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -46,6 +46,104 @@ #include "irq-gic-common.h" +#ifdef CONFIG_VIRT_PLAT_DEV +#include + +/* a reserved bus id region */ +struct plat_rsv_buses { + u8 start; /* the first reserved bus id */ + u8 count; +}; + +/* + * Build a devid pool per reserved bus id region, where all + * device ids should be unused by physical PCI devices. + */ +struct rsv_devid_pool { + struct list_head entry; + + struct plat_rsv_buses buses; + u32 start; + u32 end; + + raw_spinlock_t devid_bm_lock; + unsigned long *devid_bm; +}; + +static LIST_HEAD(rsv_devid_pools); +static DEFINE_RAW_SPINLOCK(rsv_devid_pools_lock); + +/* Do we have usable rsv_devid_pool? Initialized to be true. */ +static bool rsv_devid_pool_cap = true; +static u8 rsv_buses_start, rsv_buses_count; + +static int __init rsv_buses_start_cfg(char *buf) +{ + return kstrtou8(buf, 0, &rsv_buses_start); +} +early_param("irqchip.gicv3_rsv_buses_start", rsv_buses_start_cfg); + +static int __init rsv_buses_count_cfg(char *buf) +{ + return kstrtou8(buf, 0, &rsv_buses_count); +} +early_param("irqchip.gicv3_rsv_buses_count", rsv_buses_count_cfg); + +static void get_rsv_buses_resource(struct plat_rsv_buses *buses) +{ + buses->start = rsv_buses_start; + buses->count = rsv_buses_count; + + /* + * FIXME: There is no architectural way to get the *correct* + * reserved bus id info. 
+ * + * The first thought is to increase the GITS_TYPER.Devbits for + * the usage for virtualization, but this will break all + * command layouts with DeviceID as an argument (e.g., INT). + * + * The second way is to decrease the GITS_TYPER.Devids so that + * SW can pick the unused device IDs for use (these IDs should + * actually be supported at HW level, though not exposed). + * *Or* fetch the information with the help of firmware. They + * are essentially the same way. + */ +} + +static int probe_devid_pool_one(void) +{ + struct rsv_devid_pool *devid_pool; + + devid_pool = kzalloc(sizeof(*devid_pool), GFP_KERNEL); + if (!devid_pool) + return -ENOMEM; + + get_rsv_buses_resource(&devid_pool->buses); + raw_spin_lock_init(&devid_pool->devid_bm_lock); + + devid_pool->start = PCI_DEVID(devid_pool->buses.start, 0); + devid_pool->end = PCI_DEVID(devid_pool->buses.start + devid_pool->buses.count, 0); + + if (devid_pool->end == devid_pool->start) { + kfree(devid_pool); + return -EINVAL; + } + + devid_pool->devid_bm = bitmap_zalloc(devid_pool->end - devid_pool->start, + GFP_KERNEL); + if (!devid_pool->devid_bm) { + kfree(devid_pool); + return -ENOMEM; + } + + raw_spin_lock(&rsv_devid_pools_lock); + list_add(&devid_pool->entry, &rsv_devid_pools); + raw_spin_unlock(&rsv_devid_pools_lock); + + return 0; +} +#endif + #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) #define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2) @@ -210,6 +308,22 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) +#ifdef CONFIG_VIRT_PLAT_DEV +/* + * Currently we only build *one* devid pool. 
+ */ +static int build_devid_pools(void) +{ + struct its_node *its; + + its = list_first_entry(&its_nodes, struct its_node, entry); + if (readl_relaxed(its->base + GITS_IIDR) != 0x00051736) + return -EINVAL; + + return probe_devid_pool_one(); +} +#endif + static struct page *its_alloc_pages_node(int node, gfp_t gfp, unsigned int order) { @@ -5919,6 +6033,14 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, rdists->has_vlpis = false; pr_err("ITS: Disabling GICv4 support\n"); } + +#ifdef CONFIG_VIRT_PLAT_DEV + if (build_devid_pools()) + rsv_devid_pool_cap = false; + + if (rsv_devid_pool_cap) + pr_info("ITS: reserved device id pools enabled\n"); +#endif } register_syscore_ops(&its_syscore_ops); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 2916a2ac3804..452e8ea8bfd7 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -502,6 +502,15 @@ config HISI_HIKEY_USB switching between the dual-role USB-C port and the USB-A host ports using only one USB controller. +config VIRT_PLAT_DEV + bool "virt platform device driver" + depends on KVM && ARM64 && ARCH_HISI + default n + help + Enable this configuration option to probe the virtual platform device, + which created for the Qemu emulated device to implement virtual MSI + direct injection. + config OPEN_DICE tristate "Open Profile for DICE driver" depends on OF_RESERVED_MEM -- Gitee From 6274b36ce64a740bffb25df1a9cf81591b452f74 Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:23 +0800 Subject: [PATCH 12/59] irqchip/gic-v3-its: Alloc/Free device id from pools for virtual devices commit 7b39fd06a39128ca3804691812f65dd307806734 openEuler Alloc/Free device id from pools for virtual devices and plug the helpers into its_msi_prepare()/its_free_device(). 
Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/irqchip/irq-gic-v3-its.c | 95 ++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 4cd9738ca467..af29721d64e4 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -276,6 +276,12 @@ struct its_device { u32 nr_ites; u32 device_id; bool shared; + +#ifdef CONFIG_VIRT_PLAT_DEV + /* For virtual devices which needed the devid managed */ + bool is_vdev; + struct rsv_devid_pool *devid_pool; +#endif }; static struct { @@ -303,6 +309,60 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock); static DEFINE_IDA(its_vpeid_ida); +#ifdef CONFIG_VIRT_PLAT_DEV +static void free_devid_to_rsv_pools(struct its_device *its_dev) +{ + struct rsv_devid_pool *pool = its_dev->devid_pool; + u32 id, size; + + WARN_ON(!pool); + + id = its_dev->device_id - pool->start; + size = pool->end - pool->start; + WARN_ON(id >= size); + + raw_spin_lock(&pool->devid_bm_lock); + clear_bit(id, pool->devid_bm); + raw_spin_unlock(&pool->devid_bm_lock); + + pr_debug("ITS: free devid (%u) to rsv_devid_pools\n", its_dev->device_id); +} + +static int alloc_devid_from_rsv_pools(struct rsv_devid_pool **devid_pool, + u32 *dev_id) +{ + struct rsv_devid_pool *pool; + int err = -ENOSPC; + + raw_spin_lock(&rsv_devid_pools_lock); + list_for_each_entry(pool, &rsv_devid_pools, entry) { + u32 size, id; + + size = pool->end - pool->start; + + raw_spin_lock(&pool->devid_bm_lock); + id = find_first_zero_bit(pool->devid_bm, size); + if (id >= size) { + /* No usable device id in this pool, try next. 
*/ + raw_spin_unlock(&pool->devid_bm_lock); + continue; + } + + *dev_id = pool->start + id; + set_bit(id, pool->devid_bm); + raw_spin_unlock(&pool->devid_bm_lock); + + *devid_pool = pool; + err = 0; + break; + } + raw_spin_unlock(&rsv_devid_pools_lock); + + pr_debug("ITS: alloc devid (%u) from rsv_devid_pools\n", *dev_id); + return err; +} +#endif + #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) #define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) @@ -3747,6 +3807,13 @@ static void its_free_device(struct its_device *its_dev) raw_spin_unlock_irqrestore(&its_dev->its->lock, flags); kfree(its_dev->event_map.col_map); itt_free_pool(its_dev->itt, its_dev->itt_sz); + +#ifdef CONFIG_VIRT_PLAT_DEV + if (its_dev->is_vdev) { + WARN_ON(!rsv_devid_pool_cap); + free_devid_to_rsv_pools(its_dev); + } +#endif kfree(its_dev); } @@ -3783,6 +3850,23 @@ static int its_msi_prepare(struct irq_domain *domain, struct device *dev, */ dev_id = info->scratchpad[0].ul; +#ifdef CONFIG_VIRT_PLAT_DEV + int use_devid_pool = false; + struct rsv_devid_pool *pool = NULL; + + if (rsv_devid_pool_cap && !dev->of_node && !dev->fwnode && + info->scratchpad[0].ul == -1) + use_devid_pool = true; + + if (use_devid_pool) { + err = alloc_devid_from_rsv_pools(&pool, &dev_id); + if (err) { + pr_warn("ITS: No remaining device id\n"); + return err; + } + } +#endif + msi_info = msi_get_domain_info(domain); its = msi_info->data; @@ -3799,6 +3883,10 @@ static int its_msi_prepare(struct irq_domain *domain, struct device *dev, mutex_lock(&its->dev_alloc_lock); its_dev = its_find_device(its, dev_id); if (its_dev) { +#ifdef CONFIG_VIRT_PLAT_DEV + /* Impossible ...*/ + WARN_ON_ONCE(use_devid_pool); +#endif /* * We already have seen this ID, probably through * another alias (PCI bridge of some sort). 
No need to @@ -3819,6 +3907,13 @@ static int its_msi_prepare(struct irq_domain *domain, struct device *dev, its_dev->shared = true; pr_debug("ITT %d entries, %d bits\n", nvec, ilog2(nvec)); + +#ifdef CONFIG_VIRT_PLAT_DEV + if (use_devid_pool) { + its_dev->is_vdev = true; + its_dev->devid_pool = pool; + } +#endif out: mutex_unlock(&its->dev_alloc_lock); info->scratchpad[0].ptr = its_dev; -- Gitee From e412b3911000830dda281fb2efecebb655e33b77 Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:24 +0800 Subject: [PATCH 13/59] irqchip/gic-v3-its: Add virt platform devices MSI support commit 4cc685056a0056a0f8bdcfe901000dd4731efc6b openEuler Implement the virtual platform device MSI with the GICv3 ITS. Compared with phycial platform device msi, msi_prepare implementation need consider the devid alloc. Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/irqchip/irq-gic-v3-its-platform-msi.c | 45 +++++++++++++++++-- drivers/irqchip/irq-gic-v3-its.c | 2 +- include/linux/msi.h | 4 ++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c index daa6d5053bc3..f6694b291ca6 100644 --- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c @@ -10,6 +10,20 @@ #include #include +#ifdef CONFIG_VIRT_PLAT_DEV +static struct irq_domain *vp_irq_domain; +extern bool rsv_devid_pool_cap; + +struct irq_domain *vp_get_irq_domain(void) +{ + if (!vp_irq_domain) + pr_err("virtual platform irqdomain hasn't be initialized!\n"); + + return vp_irq_domain; +} +EXPORT_SYMBOL_GPL(vp_get_irq_domain); +#endif + static struct irq_chip its_pmsi_irq_chip = { .name = "ITS-pMSI", }; @@ -52,6 +66,19 @@ static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev, msi_info = msi_get_domain_info(domain->parent); +#ifdef 
CONFIG_VIRT_PLAT_DEV + if (rsv_devid_pool_cap && !dev->of_node && !dev->fwnode) { + WARN_ON_ONCE(domain != vp_irq_domain); + /* + * virtual platform device doesn't have a DeviceID which + * will be allocated with core ITS's help. + */ + info->scratchpad[0].ul = -1; + + goto vdev_pmsi_prepare; + } +#endif + if (dev->of_node) ret = of_pmsi_get_dev_id(domain, dev, &dev_id); else @@ -62,6 +89,9 @@ static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev, /* ITS specific DeviceID, as the core ITS ignores dev. */ info->scratchpad[0].ul = dev_id; +#ifdef CONFIG_VIRT_PLAT_DEV +vdev_pmsi_prepare: +#endif /* Allocate at least 32 MSIs, and always as a power of 2 */ nvec = max_t(int, 32, roundup_pow_of_two(nvec)); return msi_info->ops->msi_prepare(domain->parent, @@ -86,7 +116,7 @@ static const struct of_device_id its_device_id[] = { static int __init its_pmsi_init_one(struct fwnode_handle *fwnode, const char *name) { - struct irq_domain *parent; + struct irq_domain *pmsi_irqdomain, *parent; parent = irq_find_matching_fwnode(fwnode, DOMAIN_BUS_NEXUS); if (!parent || !msi_get_domain_info(parent)) { @@ -94,13 +124,22 @@ static int __init its_pmsi_init_one(struct fwnode_handle *fwnode, return -ENXIO; } - if (!platform_msi_create_irq_domain(fwnode, &its_pmsi_domain_info, - parent)) { + pmsi_irqdomain = platform_msi_create_irq_domain(fwnode, + &its_pmsi_domain_info, + parent); + if (!pmsi_irqdomain) { pr_err("%s: unable to create platform domain\n", name); return -ENXIO; } pr_info("Platform MSI: %s domain created\n", name); + +#ifdef CONFIG_VIRT_PLAT_DEV + /* Should we take other irqdomains into account? 
*/ + if (!vp_irq_domain) + vp_irq_domain = pmsi_irqdomain; +#endif + return 0; } diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index af29721d64e4..03ce9ad2f1d9 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -74,7 +74,7 @@ static LIST_HEAD(rsv_devid_pools); static DEFINE_RAW_SPINLOCK(rsv_devid_pools_lock); /* Do we have usable rsv_devid_pool? Initialized to be true. */ -static bool rsv_devid_pool_cap = true; +bool rsv_devid_pool_cap = true; static u8 rsv_buses_start, rsv_buses_count; static int __init rsv_buses_start_cfg(char *buf) diff --git a/include/linux/msi.h b/include/linux/msi.h index 0b5bca6dd071..9aad1f721015 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -680,6 +680,10 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); + +#ifdef CONFIG_VIRT_PLAT_DEV +struct irq_domain *vp_get_irq_domain(void); +#endif #else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { -- Gitee From b81767a53868af815fdbdcb5fb24c467627bc2b5 Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:25 +0800 Subject: [PATCH 14/59] virt_plat_dev: Register the virt platform device driver commit 8597aa42085141b113855f552ba9132990cad84a openEuler This driver porbed the virt platform device by name, and it just take up a few MSIs through the vp_irqdomain. 
Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/misc/Makefile | 1 + drivers/misc/virt_plat_dev.c | 118 +++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 drivers/misc/virt_plat_dev.c diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index ccf5456e1d88..16fe31fbf7d4 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_PVPANIC) += pvpanic/ obj-$(CONFIG_UACCE) += uacce/ obj-$(CONFIG_XILINX_SDFEC) += xilinx_sdfec.o obj-$(CONFIG_HISI_HIKEY_USB) += hisi_hikey_usb.o +obj-$(CONFIG_VIRT_PLAT_DEV) += virt_plat_dev.o obj-$(CONFIG_HI6421V600_IRQ) += hi6421v600-irq.o obj-$(CONFIG_OPEN_DICE) += open-dice.o obj-$(CONFIG_GP_PCI1XXXX) += mchp_pci1xxxx/ diff --git a/drivers/misc/virt_plat_dev.c b/drivers/misc/virt_plat_dev.c new file mode 100644 index 000000000000..7a3d317a2572 --- /dev/null +++ b/drivers/misc/virt_plat_dev.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019-2020 HUAWEI TECHNOLOGIES CO., LTD., All Rights Reserved. + * Author: Wanghaibin + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define VIRT_DEV_DEBUG 1 + +#ifdef VIRT_DEV_DEBUG +#define virtdev_info(fmt, ...) pr_info("virdev: " fmt, ## __VA_ARGS__) +#else +#define virtdev_info(fmt, ...) 
+#endif + +static irqreturn_t virt_irq_handle(int irq, void *data) +{ + return IRQ_HANDLED; +} + +static void virt_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ +} + +static int virt_device_probe(struct platform_device *pdev) +{ + struct msi_desc *desc; + unsigned int *drvdata = dev_get_drvdata(&pdev->dev); + unsigned int nvec = *drvdata; + struct irq_domain *vp_irqdomain = vp_get_irq_domain(); + int ret; + + if (!vp_irqdomain) + return -ENXIO; + + virtdev_info("Allocate platform msi irqs nvecs: %d\n", nvec); + dev_set_msi_domain(&pdev->dev, vp_irqdomain); + + ret = platform_msi_domain_alloc_irqs(&pdev->dev, nvec, + virt_write_msi_msg); + if (ret) { + pr_err("Allocate platform msi irqs failed %d\n", ret); + goto error; + } + + virtdev_info("Allocate platform msi irqs succeed\n"); + msi_for_each_desc(desc, &pdev->dev, MSI_DESC_ALL) { + virtdev_info("Request irq %d\n", desc->irq); + ret = request_irq(desc->irq, virt_irq_handle, 0, + "virt_dev_host", pdev); + if (ret) { + pr_err("Request irq %d failed %d\n", desc->irq, ret); + goto error_free_irqs; + } + } + + virtdev_info("Init virtual platform device driver successfully.\n"); + return 0; + +error_free_irqs: + msi_for_each_desc(desc, &pdev->dev, MSI_DESC_ALL) + free_irq(desc->irq, pdev); + + platform_msi_domain_free_irqs(&pdev->dev); +error: + return ret; +} + +static int virt_device_remove(struct platform_device *pdev) +{ + struct msi_desc *desc; + + msi_for_each_desc(desc, &pdev->dev, MSI_DESC_ALL) + free_irq(desc->irq, pdev); + + platform_msi_domain_free_irqs(&pdev->dev); + + return 0; +} + +static struct platform_driver virtdev_driver = { + .driver = { + /* Using the device & driver name to match each other */ + .name = "virt_plat_dev", + }, + .probe = virt_device_probe, + .remove = virt_device_remove, +}; + +static int __init virtdev_init(void) +{ + int ret; + + ret = platform_driver_register(&virtdev_driver); + if (ret) { + pr_err("Register virtdev platform driver failed (%d)\n", ret); + return 
ret; + } + + virtdev_info("Register virtdev platform driver succeed.\n"); + return 0; +} +module_init(virtdev_init); + +static void __exit virtdev_exit(void) +{ + platform_driver_unregister(&virtdev_driver); +} +module_exit(virtdev_exit); + +MODULE_LICENSE("GPL"); -- Gitee From 096c01709c5fc22c3ee5c1ec210ad8b5e14aabba Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:26 +0800 Subject: [PATCH 15/59] KVM: arm64: Introduce shadow device commit 013f589495b79882a74fe3303e701effb4d26723 openEuler The shadow device implement that establish relationships between virtual devices and back-end virtual platform devices. Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/Makefile | 1 + arch/arm64/kvm/arm.c | 44 +++++ arch/arm64/kvm/vgic/shadow_dev.c | 327 +++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic-init.c | 4 + include/kvm/arm_vgic.h | 30 +++ include/uapi/linux/kvm.h | 9 + 6 files changed, 415 insertions(+) create mode 100644 arch/arm64/kvm/vgic/shadow_dev.c diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 2eec980cbe5c..9221291d99a6 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -23,6 +23,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-its.o vgic/vgic-debug.o \ rme.o rme-exit.o +kvm-$(CONFIG_VIRT_PLAT_DEV) += vgic/shadow_dev.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9f8174d52e24..afc7869a6c12 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -456,6 +456,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_ARM64_HDBSS case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: r = system_supports_hdbss(); +#ifdef CONFIG_VIRT_PLAT_DEV + case KVM_CAP_ARM_VIRT_MSI_BYPASS: + r = sdev_enable; break; #endif default: @@ -1869,6 
+1872,36 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return kvm_vm_set_attr(kvm, &attr); } +#ifdef CONFIG_VIRT_PLAT_DEV + case KVM_CREATE_SHADOW_DEV: { + struct kvm_master_dev_info *mdi; + u32 nvectors; + int ret; + + if (get_user(nvectors, (const u32 __user *)argp)) + return -EFAULT; + if (!nvectors) + return -EINVAL; + + mdi = memdup_user(argp, sizeof(*mdi) + nvectors * sizeof(mdi->msi[0])); + if (IS_ERR(mdi)) + return PTR_ERR(mdi); + + ret = kvm_shadow_dev_create(kvm, mdi); + kfree(mdi); + + return ret; + } + case KVM_DEL_SHADOW_DEV: { + u32 devid; + + if (get_user(devid, (const u32 __user *)argp)) + return -EFAULT; + + kvm_shadow_dev_delete(kvm, devid); + return 0; + } +#endif default: return -EINVAL; } @@ -2633,6 +2666,13 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) kvm_arm_resume_guest(irqfd->kvm); } +#ifdef CONFIG_VIRT_PLAT_DEV +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_shadow_dev_delete_all(kvm); +} +#endif + /* Initialize Hyp-mode and memory mappings on all CPUs */ static __init int kvm_arm_init(void) { @@ -2713,6 +2753,10 @@ static __init int kvm_arm_init(void) kvm_arm_initialised = true; +#ifdef CONFIG_VIRT_PLAT_DEV + kvm_shadow_dev_init(); +#endif + return 0; out_subs: diff --git a/arch/arm64/kvm/vgic/shadow_dev.c b/arch/arm64/kvm/vgic/shadow_dev.c new file mode 100644 index 000000000000..3b1210954a94 --- /dev/null +++ b/arch/arm64/kvm/vgic/shadow_dev.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019-2020 HUAWEI TECHNOLOGIES CO., LTD., All Rights Reserved. 
+ * Author: Wanghaibin + */ + +#include +#include +#include +#include +#include +#include +#include + +static struct workqueue_struct *sdev_cleanup_wq; +static bool virt_msi_bypass; +bool sdev_enable; + +static void shadow_dev_destroy(struct work_struct *work); +static void sdev_virt_pdev_delete(struct platform_device *pdev); + +int shadow_dev_virq_bypass_inject(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + struct shadow_dev *sdev = e->cache.data; + u32 vec = e->msi.data; + u32 host_irq = sdev->host_irq[vec]; + int ret; + + ret = irq_set_irqchip_state(host_irq, IRQCHIP_STATE_PENDING, true); + WARN_RATELIMIT(ret, "IRQ %d", host_irq); + + return ret; +} + +/* Must be called with the dist->sdev_list_lock held */ +struct shadow_dev *kvm_shadow_dev_get(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct shadow_dev *sdev; + + if (!sdev_enable) + return NULL; + + list_for_each_entry(sdev, &dist->sdev_list_head, entry) { + if (sdev->devid != msi->devid) + continue; + + if (sdev->nvecs <= msi->data || + !test_bit(msi->data, sdev->enable)) + break; + + return sdev; + } + + return NULL; +} + +static struct platform_device *sdev_virt_pdev_add(u32 nvec) +{ + struct platform_device *virtdev; + int ret = -ENOMEM; + + virtdev = platform_device_alloc("virt_plat_dev", PLATFORM_DEVID_AUTO); + if (!virtdev) { + kvm_err("Allocate virtual platform device failed\n"); + goto out; + } + + dev_set_drvdata(&virtdev->dev, &nvec); + + ret = platform_device_add(virtdev); + if (ret) { + kvm_err("Add virtual platform device failed (%d)\n", ret); + goto put_device; + } + + return virtdev; + +put_device: + platform_device_put(virtdev); +out: + return ERR_PTR(ret); +} + +static void sdev_set_irq_entry(struct shadow_dev *sdev, + struct kvm_kernel_irq_routing_entry *irq_entries) +{ + int i; + + for (i = 0; i < sdev->nvecs; i++) { + irq_entries[i].msi.address_lo = sdev->msi[i].address_lo; + irq_entries[i].msi.address_hi = 
sdev->msi[i].address_hi; + irq_entries[i].msi.data = sdev->msi[i].data; + irq_entries[i].msi.flags = sdev->msi[i].flags; + irq_entries[i].msi.devid = sdev->msi[i].devid; + } +} + +static int sdev_virq_bypass_active(struct kvm *kvm, struct shadow_dev *sdev) +{ + struct kvm_kernel_irq_routing_entry *irq_entries; + struct msi_desc *desc; + u32 vec = 0; + + sdev->host_irq = kcalloc(sdev->nvecs, sizeof(int), GFP_KERNEL); + sdev->enable = bitmap_zalloc(sdev->nvecs, GFP_KERNEL); + irq_entries = kcalloc(sdev->nvecs, + sizeof(struct kvm_kernel_irq_routing_entry), + GFP_KERNEL); + + if (!irq_entries || !sdev->enable || !sdev->host_irq) { + kfree(sdev->host_irq); + kfree(sdev->enable); + kfree(irq_entries); + return -ENOMEM; + } + + sdev_set_irq_entry(sdev, irq_entries); + + msi_for_each_desc(desc, &sdev->pdev->dev, MSI_DESC_ALL) { + if (!kvm_vgic_v4_set_forwarding(kvm, desc->irq, + &irq_entries[vec])) { + set_bit(vec, sdev->enable); + sdev->host_irq[vec] = desc->irq; + } else { + /* + * Can not use shadow device for direct injection, + * though not fatal... 
+ */ + kvm_err("Shadow device set (%d) forwarding failed", + desc->irq); + } + vec++; + } + + kfree(irq_entries); + return 0; +} + +static void sdev_msi_entry_init(struct kvm_master_dev_info *mdi, + struct shadow_dev *sdev) +{ + int i; + + for (i = 0; i < sdev->nvecs; i++) { + sdev->msi[i].address_lo = mdi->msi[i].address_lo; + sdev->msi[i].address_hi = mdi->msi[i].address_hi; + sdev->msi[i].data = mdi->msi[i].data; + sdev->msi[i].flags = mdi->msi[i].flags; + sdev->msi[i].devid = mdi->msi[i].devid; + } +} + +int kvm_shadow_dev_create(struct kvm *kvm, struct kvm_master_dev_info *mdi) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct shadow_dev *sdev; + struct kvm_msi *msi; + unsigned long flags; + int ret; + + if (WARN_ON(!sdev_enable)) + return -EINVAL; + + ret = -ENOMEM; + sdev = kzalloc(sizeof(struct shadow_dev), GFP_KERNEL); + if (!sdev) + return ret; + + sdev->nvecs = mdi->nvectors; + + msi = kcalloc(sdev->nvecs, sizeof(struct kvm_msi), GFP_KERNEL); + if (!msi) + goto free_sdev; + + sdev->msi = msi; + sdev_msi_entry_init(mdi, sdev); + sdev->devid = sdev->msi[0].devid; + + sdev->pdev = sdev_virt_pdev_add(sdev->nvecs); + if (IS_ERR(sdev->pdev)) { + ret = PTR_ERR(sdev->pdev); + goto free_sdev_msi; + } + + ret = sdev_virq_bypass_active(kvm, sdev); + if (ret) + goto delete_virtdev; + + sdev->kvm = kvm; + INIT_WORK(&sdev->destroy, shadow_dev_destroy); + + raw_spin_lock_irqsave(&dist->sdev_list_lock, flags); + list_add_tail(&sdev->entry, &dist->sdev_list_head); + raw_spin_unlock_irqrestore(&dist->sdev_list_lock, flags); + + kvm_info("Create shadow device: 0x%x\n", sdev->devid); + return ret; + +delete_virtdev: + sdev_virt_pdev_delete(sdev->pdev); +free_sdev_msi: + kfree(sdev->msi); +free_sdev: + kfree(sdev); + return ret; +} + +static void sdev_virt_pdev_delete(struct platform_device *pdev) +{ + platform_device_unregister(pdev); +} + +static void sdev_virq_bypass_deactive(struct kvm *kvm, struct shadow_dev *sdev) +{ + struct kvm_kernel_irq_routing_entry 
*irq_entries; + struct msi_desc *desc; + u32 vec = 0; + + irq_entries = kcalloc(sdev->nvecs, + sizeof(struct kvm_kernel_irq_routing_entry), + GFP_KERNEL); + if (!irq_entries) + return; + + sdev_set_irq_entry(sdev, irq_entries); + + msi_for_each_desc(desc, &sdev->pdev->dev, MSI_DESC_ALL) { + if (!kvm_vgic_v4_unset_forwarding(kvm, desc->irq, + &irq_entries[vec])) { + clear_bit(vec, sdev->enable); + sdev->host_irq[vec] = 0; + } else { + kvm_err("Shadow device unset (%d) forwarding failed", + desc->irq); + } + vec++; + } + + kfree(sdev->host_irq); + kfree(sdev->enable); + kfree(irq_entries); + + /* FIXME: no error handling */ +} + +static void shadow_dev_destroy(struct work_struct *work) +{ + struct shadow_dev *sdev = container_of(work, struct shadow_dev, destroy); + struct kvm *kvm = sdev->kvm; + + sdev_virq_bypass_deactive(kvm, sdev); + sdev_virt_pdev_delete(sdev->pdev); + + sdev->nvecs = 0; + kfree(sdev->msi); + kfree(sdev); +} + +void kvm_shadow_dev_delete(struct kvm *kvm, u32 devid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct shadow_dev *sdev, *tmp; + unsigned long flags; + + if (WARN_ON(!sdev_enable)) + return; + + raw_spin_lock_irqsave(&dist->sdev_list_lock, flags); + WARN_ON(list_empty(&dist->sdev_list_head)); /* shouldn't be invoked */ + + list_for_each_entry_safe(sdev, tmp, &dist->sdev_list_head, entry) { + if (sdev->devid != devid) + continue; + + list_del(&sdev->entry); + queue_work(sdev_cleanup_wq, &sdev->destroy); + break; + } + raw_spin_unlock_irqrestore(&dist->sdev_list_lock, flags); + + flush_workqueue(sdev_cleanup_wq); +} + +void kvm_shadow_dev_delete_all(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct shadow_dev *sdev, *tmp; + unsigned long flags; + + if (!sdev_enable) + return; + + raw_spin_lock_irqsave(&dist->sdev_list_lock, flags); + + list_for_each_entry_safe(sdev, tmp, &dist->sdev_list_head, entry) { + list_del(&sdev->entry); + queue_work(sdev_cleanup_wq, &sdev->destroy); + } + + 
raw_spin_unlock_irqrestore(&dist->sdev_list_lock, flags); + + flush_workqueue(sdev_cleanup_wq); +} + +static int __init early_virt_msi_bypass(char *buf) +{ + return strtobool(buf, &virt_msi_bypass); +} +early_param("kvm-arm.virt_msi_bypass", early_virt_msi_bypass); + +void kvm_shadow_dev_init(void) +{ + /* + * FIXME: Ideally shadow device should only rely on a GICv4.0 + * capable ITS, but we should also take the reserved device ID + * pools into account. + */ + sdev_enable = kvm_vgic_global_state.has_gicv4 && virt_msi_bypass; + + sdev_cleanup_wq = alloc_workqueue("kvm-sdev-cleanup", 0, 0); + if (!sdev_cleanup_wq) + sdev_enable = false; + + kvm_info("Shadow device %sabled\n", sdev_enable ? "en" : "dis"); +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index cffb3030c524..35699f10790a 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -56,6 +56,10 @@ void kvm_vgic_early_init(struct kvm *kvm) INIT_LIST_HEAD(&dist->lpi_list_head); INIT_LIST_HEAD(&dist->lpi_translation_cache); raw_spin_lock_init(&dist->lpi_list_lock); +#ifdef CONFIG_VIRT_PLAT_DEV + INIT_LIST_HEAD(&dist->sdev_list_head); + raw_spin_lock_init(&dist->sdev_list_lock); +#endif } /* CREATION */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 5b27f94d4fad..0e6726a5c707 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -35,6 +35,23 @@ #define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \ (irq) <= VGIC_MAX_SPI) +#ifdef CONFIG_VIRT_PLAT_DEV +struct shadow_dev { + struct kvm *kvm; + struct list_head entry; + + u32 devid; /* guest visible device id */ + u32 nvecs; + unsigned long *enable; + int *host_irq; + struct kvm_msi *msi; + + struct platform_device *pdev; + + struct work_struct destroy; +}; +#endif + enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ @@ -292,6 +309,10 @@ struct vgic_dist { * else. 
*/ struct its_vm its_vm; +#ifdef CONFIG_VIRT_PLAT_DEV + raw_spinlock_t sdev_list_lock; + struct list_head sdev_list_head; +#endif }; struct vgic_v2_cpu_if { @@ -437,4 +458,13 @@ int vgic_v4_put(struct kvm_vcpu *vcpu); void kvm_vgic_cpu_up(void); void kvm_vgic_cpu_down(void); +#ifdef CONFIG_VIRT_PLAT_DEV +extern bool sdev_enable; + +void kvm_shadow_dev_init(void); +int kvm_shadow_dev_create(struct kvm *kvm, struct kvm_master_dev_info *mdi); +void kvm_shadow_dev_delete(struct kvm *kvm, u32 devid); +void kvm_shadow_dev_delete_all(struct kvm *kvm); +#endif + #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b4206f7b66f2..73d02597a99f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1229,6 +1229,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 #define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 @@ -1497,6 +1498,11 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +struct kvm_master_dev_info { + __u32 nvectors; + struct kvm_msi msi[]; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. 
@@ -1614,6 +1620,9 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) #define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) +#define KVM_CREATE_SHADOW_DEV _IOW(KVMIO, 0xf0, struct kvm_master_dev_info) +#define KVM_DEL_SHADOW_DEV _IOW(KVMIO, 0xf1, __u32) + /* * ioctls for vcpu fds */ -- Gitee From 2db292705a7b8f7654935790e347aa05243e2dfc Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:27 +0800 Subject: [PATCH 16/59] KVM: arm64: kire: irq routing entry cached the relevant cache data commit e25e2fd494003f0fe9cc8b5e0be3e74148c45a79 openEuler For each kernel irq routing entry, cached the relevant virtual shadow device, for IRQ bypass inject in future. Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/vgic/vgic-irqfd.c | 23 +++++++++++++++++++++++ include/kvm/arm_vgic.h | 1 + include/linux/kvm_host.h | 16 ++++++++++++++++ virt/kvm/eventfd.c | 11 +++++++++++ 4 files changed, 51 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 475059bacedf..381b700c7141 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -9,6 +9,29 @@ #include #include "vgic.h" +#ifdef CONFIG_VIRT_PLAT_DEV +static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm_msi *msi); + +void kire_arch_cached_data_update(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kire_data *cache = &e->cache; + struct shadow_dev *sdev; + struct kvm_msi msi; + + kvm_populate_msi(e, &msi); + + raw_spin_lock(&dist->sdev_list_lock); + sdev = kvm_shadow_dev_get(kvm, &msi); + raw_spin_unlock(&dist->sdev_list_lock); + + cache->valid = !!sdev; + cache->data = sdev; +} +#endif + /** * vgic_irqfd_set_irq: inject the IRQ corresponding to the * irqchip 
routing entry diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 0e6726a5c707..9c20fea176f1 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -465,6 +465,7 @@ void kvm_shadow_dev_init(void); int kvm_shadow_dev_create(struct kvm *kvm, struct kvm_master_dev_info *mdi); void kvm_shadow_dev_delete(struct kvm *kvm, u32 devid); void kvm_shadow_dev_delete_all(struct kvm *kvm); +struct shadow_dev *kvm_shadow_dev_get(struct kvm *kvm, struct kvm_msi *msi); #endif #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c19e08bc0fc0..2da89425cca1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -651,6 +651,13 @@ struct kvm_xen_evtchn { u32 priority; }; +#ifdef CONFIG_VIRT_PLAT_DEV +struct kire_data { + bool valid; + void *data; +}; +#endif + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -674,6 +681,10 @@ struct kvm_kernel_irq_routing_entry { struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; + +#ifdef CONFIG_VIRT_PLAT_DEV + struct kire_data cache; +#endif }; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING @@ -1708,6 +1719,11 @@ int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); +#ifdef CONFIG_VIRT_PLAT_DEV +void kire_arch_cached_data_update(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e); +#endif + /* * Returns a pointer to the memslot if it contains gfn. * Otherwise returns NULL. 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 89912a17f5d5..a7c2667209d3 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -38,6 +38,14 @@ kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) return true; } +#ifdef CONFIG_VIRT_PLAT_DEV +void __weak +kire_arch_cached_data_update(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ +} +#endif + static void irqfd_inject(struct work_struct *work) { @@ -270,6 +278,9 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) else irqfd->irq_entry.type = 0; +#ifdef CONFIG_VIRT_PLAT_DEV + kire_arch_cached_data_update(kvm, &irqfd->irq_entry); +#endif write_seqcount_end(&irqfd->irq_entry_sc); } -- Gitee From 633cffe3c237267cd1f0fda659e33689f95707df Mon Sep 17 00:00:00 2001 From: wanghaibin Date: Thu, 11 Jan 2024 20:43:28 +0800 Subject: [PATCH 17/59] KVM: arm64: sdev: Support virq bypass by INT/VSYNC command commit 3a95fc619fa462fca60b502ed87cad889bebd22b openEuler We already have the set_irqchip_state() callback which will issue INT/VSYNC commands on the physical side when the handled IRQ is forwarded to vcpu. It was intended to be used to handle the guest INT command targeting a VLPI. Let's reuse this hack to set the virtio-pci function's VLPI pending, instead of directly writing message data into GITS_TRANSLATER, which should only be treated as the Message Address of PCI devices. 
Signed-off-by: wanghaibin Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/vgic/vgic-irqfd.c | 21 +++++++++++++++++++++ include/kvm/arm_vgic.h | 3 +++ 2 files changed, 24 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 381b700c7141..f7de55ae55be 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -121,6 +121,23 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } +#ifdef CONFIG_VIRT_PLAT_DEV +static int kvm_arch_set_irq_bypass(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct kire_data *cache = &e->cache; + + /* + * FIXME: is there any race against the irqfd_update(), + * where the cache data will be updated? + */ + if (!cache->valid) + return -EWOULDBLOCK; + + return shadow_dev_virq_bypass_inject(kvm, e); +} +#endif + /** * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ @@ -138,6 +155,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (!vgic_has_its(kvm)) break; +#ifdef CONFIG_VIRT_PLAT_DEV + if (!kvm_arch_set_irq_bypass(e, kvm)) + return 0; +#endif kvm_populate_msi(e, &msi); return vgic_its_inject_cached_translation(kvm, &msi); } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9c20fea176f1..f804210f9596 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -466,6 +466,9 @@ int kvm_shadow_dev_create(struct kvm *kvm, struct kvm_master_dev_info *mdi); void kvm_shadow_dev_delete(struct kvm *kvm, u32 devid); void kvm_shadow_dev_delete_all(struct kvm *kvm); struct shadow_dev *kvm_shadow_dev_get(struct kvm *kvm, struct kvm_msi *msi); + +int shadow_dev_virq_bypass_inject(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e); #endif #endif /* __KVM_ARM_VGIC_H */ -- Gitee From c1292f5bc4c113a73af167b93b15b6768edb5fc3 Mon Sep 17 00:00:00 2001 
From: Jia Qingtong Date: Thu, 2 Jan 2025 17:25:55 +0800 Subject: [PATCH 18/59] acpi/iort: Add func to get used deviceid bitmap commit f234395f459d55fc974ff1f4241c379d9d7d6b32 openEuler Build DeviceID used bitmap from ACPI IORT table. Signed-off-by: Jia Qingtong Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/acpi/arm64/iort.c | 84 ++++++++++++++++++++++++++++++++ drivers/irqchip/irq-gic-v3-its.c | 29 ++++++++++- include/linux/acpi_iort.h | 1 + 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 6ee8382a3230..b9f3cced9c18 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -180,6 +180,90 @@ int iort_register_domain_token(int trans_id, phys_addr_t base, return 0; } +static bool iort_mapd_to_its(struct acpi_iort_node *node) +{ + return node->type == ACPI_IORT_NODE_ITS_GROUP; +} + +bool iort_gen_used_DeviceID_bitmap(unsigned long *bus_bm, resource_size_t len) +{ + struct acpi_iort_node *iort_node, *iort_end, *parent; + struct acpi_table_iort *iort; + int i, j, k, start_range, end_range_included; + struct acpi_iort_id_mapping *map; + + if (!iort_table) { + pr_err(FW_BUG "iort_table hasn't inited.\n"); + return -ENODEV; + } + + /* + * iort_table and iort both point to the start of IORT table, but + * have different struct types + */ + iort = (struct acpi_table_iort *)iort_table; + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort, + iort->node_offset); + iort_end = ACPI_ADD_PTR(struct acpi_iort_node, iort, + iort_table->length); + + for (i = 0; i < iort->node_count; i++) { + if (WARN_TAINT(iort_node >= iort_end, TAINT_FIRMWARE_WORKAROUND, + "IORT node pointer overflows, bad table!\n")) + return -EINVAL; + + /* find all map that go to ITS, so skip ITS node and no mapping node. 
*/ + if (iort_node->type == ACPI_IORT_NODE_ITS_GROUP || + !iort_node->mapping_count || + !iort_node->mapping_offset) { + pr_info(FW_INFO "[node %p type %d] don't map to its, skip it.\n", + iort_node, iort_node->type); + goto next_loop; + } + + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, iort_node, + iort_node->mapping_offset); + + for (j = 0; j < iort_node->mapping_count; j++, map++) { + /* Firmware bug! */ + if (!map->output_reference) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference, skip it.\n", + iort_node, iort_node->type); + goto next_loop; + } + + parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, + map->output_reference); + if (!iort_mapd_to_its(parent)) { + pr_debug(FW_INFO "[node %p type %d map %p] isn't map to its, skip it.\n", + iort_node, iort_node->type, map); + goto next_loop; + } + /* + * What we want to get is some free DeviceID, + * and [output_base, output_base + id_count) is that we find. + * We will manage the DeviceID usage using bitmap. Since manage + * that range one by one using bitmap is a litte awaste of memory, + * let's use two level bitmap, and we can reuse PCIe's BUS concept + * to build our two level bitmap. + * Note that THIS HAS NOTHING TO DO WITH PCIe. + */ + start_range = PCI_BUS_NUM(map->output_base); + end_range_included = PCI_BUS_NUM(map->output_base + map->id_count); + pr_debug(FW_INFO "[node %p type %d map %p] add [%x,%x]->[%x,%x]\n", + iort_node, iort_node->type, map, map->output_base, + map->output_base + map->id_count, start_range, + end_range_included); + for (k = start_range; k != end_range_included + 1; ++k) + set_bit(k, bus_bm); + } +next_loop: + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort_node, iort_node->length); + } + + return 0; +} + /** * iort_deregister_domain_token() - Deregister domain token based on ITS ID * @trans_id: ITS ID. 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 03ce9ad2f1d9..bf1c5e8daeb1 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -49,6 +49,34 @@ #ifdef CONFIG_VIRT_PLAT_DEV #include +static int iort_get_used_bus_bitmap(unsigned long **bus_bm, resource_size_t *len) +{ + resource_size_t idx; + /* PCIe bus has 8 bits */ + const size_t BUS_MAX_NUM = 0x100; + + if (bus_bm == NULL || len == NULL) + return -EINVAL; + + *bus_bm = bitmap_zalloc(BUS_MAX_NUM, GFP_KERNEL); + if (*bus_bm == NULL) + return -ENOMEM; + + *len = BUS_MAX_NUM; + + if (iort_gen_used_DeviceID_bitmap(*bus_bm, *len)) { + bitmap_free(*bus_bm); + return -EINVAL; + } + + pr_debug("generated bus bitmap :"); + for (idx = 0; idx != BUS_MAX_NUM; ++idx) + pr_debug("idx[%llx] %x", idx, test_bit(idx, *bus_bm)); + + return 0; +} +#endif + /* a reserved bus id region */ struct plat_rsv_buses { u8 start; /* the first reserved bus id */ @@ -142,7 +170,6 @@ static int probe_devid_pool_one(void) return 0; } -#endif #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 1cb65592c95d..a5e397fe05a8 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -43,6 +43,7 @@ int iort_dma_get_ranges(struct device *dev, u64 *size); int iort_iommu_configure_id(struct device *dev, const u32 *id_in); void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head); phys_addr_t acpi_iort_dma_get_max_cpu_address(void); +bool iort_gen_used_DeviceID_bitmap(unsigned long *bus_bm, resource_size_t len); #else static inline u32 iort_msi_map_id(struct device *dev, u32 id) { return id; } -- Gitee From f540150a91f44beb47f11e1647b050c5c652239e Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Thu, 2 Jan 2025 17:25:56 +0800 Subject: [PATCH 19/59] irqchip/gic-v3-its: Add ACPI_IORT as VIRT_PLAT_DEV's dependency commit 
8bc36d370007811cdabdcb19f509e0d6bf70dec5 openEuler Make VIRT_PLAT_DEV dependent on ACPI_IORT, as VIRT_PLAT_DEV will use DeviceID used info generated from ACPI IORT table. Signed-off-by: Jia Qingtong Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/misc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 452e8ea8bfd7..b5184ce9d86e 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -504,7 +504,7 @@ config HISI_HIKEY_USB config VIRT_PLAT_DEV bool "virt platform device driver" - depends on KVM && ARM64 && ARCH_HISI + depends on KVM && ARM64 && ARCH_HISI && ACPI_IORT default n help Enable this configuration option to probe the virtual platform device, -- Gitee From 71061596ce201fa6487628cecf817627dd07d063 Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Thu, 2 Jan 2025 17:25:57 +0800 Subject: [PATCH 20/59] irqchip/gic-v3-its: Move build_devid_pools from its to acpi iort init commit af552f9d957def499065ed6e194bcd5f0c75dbc5 openEuler Move build_devid_pools' call to iort init as VIRT_PLAT_DEV depedent on ACPI IORT table's DeviceID info. 
Signed-off-by: Jia Qingtong Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/acpi/arm64/iort.c | 8 ++++++++ drivers/irqchip/irq-gic-v3-its.c | 21 +++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index b9f3cced9c18..5da3732722be 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -2049,6 +2049,10 @@ static void __init iort_init_platform_devices(void) } } +#ifdef CONFIG_VIRT_PLAT_DEV +void build_devid_pools(void); +#endif + void __init acpi_iort_init(void) { acpi_status status; @@ -2069,6 +2073,10 @@ void __init acpi_iort_init(void) } iort_init_platform_devices(); + +#ifdef CONFIG_VIRT_PLAT_DEV + build_devid_pools(); +#endif } #ifdef CONFIG_ZONE_DMA diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index bf1c5e8daeb1..d2d3ef529166 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -101,8 +101,8 @@ struct rsv_devid_pool { static LIST_HEAD(rsv_devid_pools); static DEFINE_RAW_SPINLOCK(rsv_devid_pools_lock); -/* Do we have usable rsv_devid_pool? Initialized to be true. */ -bool rsv_devid_pool_cap = true; +/* Do we have usable rsv_devid_pool? Initialized to be false. */ +bool rsv_devid_pool_cap; static u8 rsv_buses_start, rsv_buses_count; static int __init rsv_buses_start_cfg(char *buf) @@ -399,15 +399,19 @@ static int alloc_devid_from_rsv_pools(struct rsv_devid_pool **devid_pool, /* * Currently we only build *one* devid pool. 
*/ -static int build_devid_pools(void) +void build_devid_pools(void) { struct its_node *its; its = list_first_entry(&its_nodes, struct its_node, entry); if (readl_relaxed(its->base + GITS_IIDR) != 0x00051736) - return -EINVAL; + return; + + if (!probe_devid_pool_one()) + rsv_devid_pool_cap = true; - return probe_devid_pool_one(); + if (rsv_devid_pool_cap) + pr_info("ITS: reserved device id pools enabled\n"); } #endif @@ -6156,13 +6160,6 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, pr_err("ITS: Disabling GICv4 support\n"); } -#ifdef CONFIG_VIRT_PLAT_DEV - if (build_devid_pools()) - rsv_devid_pool_cap = false; - - if (rsv_devid_pool_cap) - pr_info("ITS: reserved device id pools enabled\n"); -#endif } register_syscore_ops(&its_syscore_ops); -- Gitee From 5065dc39468fe52b471ec119b9e8d2cbaf1c2bae Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Thu, 2 Jan 2025 17:25:58 +0800 Subject: [PATCH 21/59] irqchip/gic-v3-its: Init reserved rsv_devid_pools use pci bus info commit 1572bef21a3ef247a6b0989fc167d15b45954f47 openEuler Auto probe free DeviceID using IORT table info other than using the cmdline opt. Signed-off-by: Jia Qingtong Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/irqchip/irq-gic-v3-its.c | 105 ++++++++++++++++++------------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index d2d3ef529166..319c1da8aae8 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -103,54 +103,19 @@ static DEFINE_RAW_SPINLOCK(rsv_devid_pools_lock); /* Do we have usable rsv_devid_pool? Initialized to be false. 
*/ bool rsv_devid_pool_cap; -static u8 rsv_buses_start, rsv_buses_count; -static int __init rsv_buses_start_cfg(char *buf) -{ - return kstrtou8(buf, 0, &rsv_buses_start); -} -early_param("irqchip.gicv3_rsv_buses_start", rsv_buses_start_cfg); - -static int __init rsv_buses_count_cfg(char *buf) -{ - return kstrtou8(buf, 0, &rsv_buses_count); -} -early_param("irqchip.gicv3_rsv_buses_count", rsv_buses_count_cfg); - -static void get_rsv_buses_resource(struct plat_rsv_buses *buses) -{ - buses->start = rsv_buses_start; - buses->count = rsv_buses_count; - - /* - * FIXME: There is no architectural way to get the *correct* - * reserved bus id info. - * - * The first thought is to increase the GITS_TYPER.Devbits for - * the usage for virtualization, but this will break all - * command layouts with DeviceID as an argument (e.g., INT). - * - * The second way is to decrease the GITS_TYPER.Devids so that - * SW can pick the unused device IDs for use (these IDs should - * actually be supported at HW level, though not exposed). - * *Or* fetch the information with the help of firmware. They - * are essentially the same way. 
- */ -} - -static int probe_devid_pool_one(void) +#ifdef CONFIG_VIRT_PLAT_DEV +static int add_bus_range_to_pool(resource_size_t start_bus, resource_size_t end_bus) { struct rsv_devid_pool *devid_pool; devid_pool = kzalloc(sizeof(*devid_pool), GFP_KERNEL); if (!devid_pool) return -ENOMEM; - - get_rsv_buses_resource(&devid_pool->buses); raw_spin_lock_init(&devid_pool->devid_bm_lock); - devid_pool->start = PCI_DEVID(devid_pool->buses.start, 0); - devid_pool->end = PCI_DEVID(devid_pool->buses.start + devid_pool->buses.count, 0); + devid_pool->start = PCI_DEVID(start_bus, 0); + devid_pool->end = PCI_DEVID(end_bus, 0); if (devid_pool->end == devid_pool->start) { kfree(devid_pool); @@ -158,19 +123,75 @@ static int probe_devid_pool_one(void) } devid_pool->devid_bm = bitmap_zalloc(devid_pool->end - devid_pool->start, - GFP_KERNEL); + GFP_KERNEL); if (!devid_pool->devid_bm) { kfree(devid_pool); return -ENOMEM; } - raw_spin_lock(&rsv_devid_pools_lock); + /* here we need'nt get the rsv_devid_pools_lock. only the consumer needs. 
*/ list_add(&devid_pool->entry, &rsv_devid_pools); - raw_spin_unlock(&rsv_devid_pools_lock); + pr_debug("ITS: add [%x-%x] to bus pool\n", devid_pool->start, devid_pool->end); return 0; } +static int probe_devid_pool_one(void) +{ + resource_size_t idx, begin_idx, end_idx, bm_len; + unsigned long *devid_bm; + bool found_begin = false, found_end = false; + + if (iort_get_used_bus_bitmap(&devid_bm, &bm_len)) + return -EINVAL; + + for (idx = 0; idx != bm_len; ++idx) { + bool cur_bit_set = test_bit(idx, devid_bm); + + if (!cur_bit_set && found_begin == false) { + /* found the empty bits begin */ + begin_idx = idx; + found_begin = true; + + /* for the case that first zero is last bit */ + if (idx == bm_len - 1) { + /* found the empts bits end */ + end_idx = bm_len; + found_end = true; + } else { + /* let's find the end */ + continue; + } + } else if (cur_bit_set && found_begin == true) { + /* found the empts bits end */ + end_idx = idx; + found_end = true; + } else if (idx == bm_len - 1 && found_begin == true) { + /* found the empts bits end */ + end_idx = bm_len; + found_end = true; + } else { + /* nothing special found, all zero or all one, skip */ + continue; + } + + /* here we found the begin & end, let's build a pool and add to pool list */ + add_bus_range_to_pool(begin_idx, end_idx); + found_begin = found_end = false; + } + bitmap_free(devid_bm); + /* here we need'nt get the rsv_devid_pools_lock. only the consumer needs. 
*/ + if (list_empty(&rsv_devid_pools)) + return -EINVAL; + return 0; +} +#else +static int probe_devid_pool_one(void) +{ + return -EINVAL; +} +#endif + #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING (1ULL << 0) #define ITS_FLAGS_WORKAROUND_CAVIUM_22375 (1ULL << 1) #define ITS_FLAGS_WORKAROUND_CAVIUM_23144 (1ULL << 2) -- Gitee From d307ea2d18f5caf4d6a99850df93a8e4d2039ab5 Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Wed, 23 Jul 2025 11:06:56 +0800 Subject: [PATCH 22/59] irqchip/gic-v3-its: Remove DevID Pool's restriction commit 2dfa84d06b8accefaae722c5d7a6aee29d69cd19 openEuler DevID Pool only relay on ARM's IORT, so delete the chip restriction. Signed-off-by: Jia Qingtong Signed-off-by: Xie Xiaodong <624338359@qq.com> --- drivers/irqchip/irq-gic-v3-its.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 319c1da8aae8..294c853beb88 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -422,12 +422,6 @@ static int alloc_devid_from_rsv_pools(struct rsv_devid_pool **devid_pool, */ void build_devid_pools(void) { - struct its_node *its; - - its = list_first_entry(&its_nodes, struct its_node, entry); - if (readl_relaxed(its->base + GITS_IIDR) != 0x00051736) - return; - if (!probe_devid_pool_one()) rsv_devid_pool_cap = true; -- Gitee From 56a60dd91cf5e1ff08ab8cc36d63c33dd8db6ae3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:01 +0100 Subject: [PATCH 23/59] KVM: arm64: vgic: Make kvm_vgic_inject_irq() take a vcpu pointer commit 9a0a75d3ccee20149587ab740a2dee31ba401ada upstream Passing a vcpu_id to kvm_vgic_inject_irq() is silly for two reasons: - we often confuse vcpu_id and vcpu_idx - we eventually have to convert it back to a vcpu - we can't count Instead, pass a vcpu pointer, which is unambiguous. A NULL vcpu is also allowed for interrupts that are not private to a vcpu (such as SPIs). 
Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-2-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/arch_timer.c | 2 +- arch/arm64/kvm/arm.c | 23 ++++++++--------------- arch/arm64/kvm/pmu-emul.c | 2 +- arch/arm64/kvm/vgic/vgic-irqfd.c | 2 +- arch/arm64/kvm/vgic/vgic.c | 12 +++++------- include/kvm/arm_vgic.h | 4 ++-- 6 files changed, 18 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 94fe11382343..df9d03da4b1c 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -469,7 +469,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, timer_ctx->irq.level); if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, timer_irq(timer_ctx), timer_ctx->irq.level, timer_ctx); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index afc7869a6c12..27344ae89d73 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1309,27 +1309,23 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status) { u32 irq = irq_level->irq; - unsigned int irq_type, vcpu_idx, irq_num; - int nrcpus = atomic_read(&kvm->online_vcpus); + unsigned int irq_type, vcpu_id, irq_num; struct kvm_vcpu *vcpu = NULL; bool level = irq_level->level; irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; - vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; - vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); + vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; + vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; - trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level); + 
trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level); switch (irq_type) { case KVM_ARM_IRQ_TYPE_CPU: if (irqchip_in_kernel(kvm)) return -ENXIO; - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; @@ -1341,17 +1337,14 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (vcpu_idx >= nrcpus) - return -EINVAL; - - vcpu = kvm_get_vcpu(kvm, vcpu_idx); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) return -EINVAL; - return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL); + return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); case KVM_ARM_IRQ_TYPE_SPI: if (!irqchip_in_kernel(kvm)) return -ENXIO; @@ -1359,7 +1352,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; - return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL); + return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL); } return -EINVAL; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index af2d1f0a55d3..ad766cc0e192 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -366,7 +366,7 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) pmu->irq_level = overflow; if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, overflow, pmu); WARN_ON(ret); } diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index f7de55ae55be..25f7a5dda02a 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -46,7 +46,7 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, if (!vgic_valid_spi(kvm, spi_id)) return -EINVAL; - 
return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL); + return kvm_vgic_inject_irq(kvm, NULL, spi_id, level, NULL); } /** diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 07d73faacc51..0650a5b57ed9 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -431,7 +431,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, /** * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs + * @vcpu: The CPU for PPIs or NULL for global interrupts * @intid: The INTID to inject a new state to. * @level: Edge-triggered: true: to trigger the interrupt * false: to ignore the call @@ -445,24 +445,22 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, * level-sensitive interrupts. You can think of the level parameter as 1 * being HIGH and 0 being LOW and all devices being active-HIGH. */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level, void *owner) +int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int intid, bool level, void *owner) { - struct kvm_vcpu *vcpu; struct vgic_irq *irq; unsigned long flags; int ret; - trace_vgic_update_irq_pending(cpuid, intid, level); - ret = vgic_lazy_init(kvm); if (ret) return ret; - vcpu = kvm_get_vcpu(kvm, cpuid); if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) return -EINVAL; + trace_vgic_update_irq_pending(vcpu ? 
vcpu->vcpu_idx : 0, intid, level); + irq = vgic_get_irq(kvm, vcpu, intid); if (!irq) return -EINVAL; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f804210f9596..acb521b6bfca 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -396,8 +396,8 @@ int kvm_vgic_map_resources(struct kvm *kvm); int kvm_vgic_hyp_init(void); void kvm_vgic_init_cpu_hardware(void); -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level, void *owner); +int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int intid, bool level, void *owner); int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid, struct irq_ops *ops); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid); -- Gitee From 97e7d6173ee71e15c23fa0acf76b66163f985971 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:02 +0100 Subject: [PATCH 24/59] KVM: arm64: vgic-its: Treat the collection target address as a vcpu_id commit d455d366c451e68781122c693e2e357c673ee807 upstream Since our emulated ITS advertises GITS_TYPER.PTA=0, the target address associated to a collection is a PE number and not an address. So far, so good. However, the PE number is what userspace has provided given us (aka the vcpu_id), and not the internal vcpu index. Make sure we consistently retrieve the vcpu by ID rather than by index, adding a helper that deals with most of the cases. We also get rid of the pointless (and bogus) comparisons to online_vcpus, which don't really make sense. 
Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-3-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/vgic/vgic-its.c | 49 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 608c39859f4e..dc43b51cc373 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -378,6 +378,12 @@ static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) return ret; } +static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm, + struct its_collection *col) +{ + return kvm_get_vcpu_by_id(kvm, col->target_addr); +} + /* * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI * is targeting) to the VGIC's view, which deals with target VCPUs. @@ -391,7 +397,7 @@ static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) if (!its_is_collection_mapped(ite->collection)) return; - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + vcpu = collection_to_vcpu(kvm, ite->collection); update_affinity(ite->irq, vcpu); } @@ -686,7 +692,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, if (!ite || !its_is_collection_mapped(ite->collection)) return E_ITS_INT_UNMAPPED_INTERRUPT; - vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + vcpu = collection_to_vcpu(kvm, ite->collection); if (!vcpu) return E_ITS_INT_UNMAPPED_INTERRUPT; @@ -899,7 +905,7 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return E_ITS_MOVI_UNMAPPED_COLLECTION; ite->collection = collection; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); vgic_its_invalidate_cache(kvm); @@ -1133,7 +1139,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, } if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, 
collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); irq = vgic_add_lpi(kvm, lpi_nr, vcpu); if (IS_ERR(irq)) { @@ -1256,21 +1262,22 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { u16 coll_id; - u32 target_addr; struct its_collection *collection; bool valid; valid = its_cmd_get_validbit(its_cmd); coll_id = its_cmd_get_collection(its_cmd); - target_addr = its_cmd_get_target_addr(its_cmd); - - if (target_addr >= atomic_read(&kvm->online_vcpus)) - return E_ITS_MAPC_PROCNUM_OOR; if (!valid) { vgic_its_free_collection(its, coll_id); vgic_its_invalidate_cache(kvm); } else { + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + if (!vcpu) + return E_ITS_MAPC_PROCNUM_OOR; + collection = find_collection(its, coll_id); if (!collection) { @@ -1284,9 +1291,9 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, coll_id); if (ret) return ret; - collection->target_addr = target_addr; + collection->target_addr = vcpu->vcpu_id; } else { - collection->target_addr = target_addr; + collection->target_addr = vcpu->vcpu_id; update_affinity_collection(kvm, its, collection); } } @@ -1396,7 +1403,7 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, if (!its_is_collection_mapped(collection)) return E_ITS_INVALL_UNMAPPED_COLLECTION; - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = collection_to_vcpu(kvm, collection); vgic_its_invall(vcpu); return 0; @@ -1413,23 +1420,21 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { - u32 target1_addr = its_cmd_get_target_addr(its_cmd); - u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); struct kvm_vcpu *vcpu1, *vcpu2; struct vgic_irq *irq; u32 *intids; int irq_count, i; - if (target1_addr >= atomic_read(&kvm->online_vcpus) || - target2_addr >= 
atomic_read(&kvm->online_vcpus)) + /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */ + vcpu1 = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd)); + vcpu2 = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32)); + + if (!vcpu1 || !vcpu2) return E_ITS_MOVALL_PROCNUM_OOR; - if (target1_addr == target2_addr) + if (vcpu1 == vcpu2) return 0; - vcpu1 = kvm_get_vcpu(kvm, target1_addr); - vcpu2 = kvm_get_vcpu(kvm, target2_addr); - irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); if (irq_count < 0) return irq_count; @@ -2274,7 +2279,7 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, return PTR_ERR(ite); if (its_is_collection_mapped(collection)) - vcpu = kvm_get_vcpu(kvm, collection->target_addr); + vcpu = kvm_get_vcpu_by_id(kvm, collection->target_addr); irq = vgic_add_lpi(kvm, lpi_id, vcpu); if (IS_ERR(irq)) { @@ -2589,7 +2594,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) coll_id = val & KVM_ITS_CTE_ICID_MASK; if (target_addr != COLLECTION_NOT_MAPPED && - target_addr >= atomic_read(&kvm->online_vcpus)) + !kvm_get_vcpu_by_id(kvm, target_addr)) return -EINVAL; collection = find_collection(its, coll_id); -- Gitee From 975d2b313a2fed396d62f107315ccdb955b9fc68 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:03 +0100 Subject: [PATCH 25/59] KVM: arm64: vgic-v3: Refactor GICv3 SGI generation commit f3f60a56539108dacfd87b6514398b3970daec54 upstream As we're about to change the way SGIs are sent, start by splitting out some of the basic functionnality: instead of intermingling the broadcast and non-broadcast cases with the actual SGI generation, perform the following cleanups: - move the SGI queuing into its own helper - split the broadcast code from the affinity-driven code - replace the mask/shift combinations with FIELD_GET() - fix the confusion between vcpu_id and vcpu when handling the broadcast case The result is much more readable, and paves the way for 
further optimisations. Tested-by: Joey Gouly Tested-by: Shameer Kolothum Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-4-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 110 ++++++++++++++++------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 7c0b23415ad9..31e0b5aa2678 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -1075,6 +1075,38 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) +static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, sgi); + unsigned long flags; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + + /* + * An access targeting Group0 SGIs can only generate + * those, while an access targeting Group1 SGIs can + * generate interrupts of either group. + */ + if (!irq->group || allow_group1) { + if (!irq->hw) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + /* HW SGI? 
Ask the GIC to inject it */ + int err; + err = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + true); + WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + + vgic_put_irq(vcpu->kvm, irq); +} + /** * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs * @vcpu: The VCPU requesting a SGI @@ -1093,19 +1125,30 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *c_vcpu; - u16 target_cpus; + unsigned long target_cpus; u64 mpidr; - int sgi; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - unsigned long c, flags; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; + u32 sgi; + unsigned long c; + + sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg); + + /* Broadcast */ + if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) { + kvm_for_each_vcpu(c, c_vcpu, kvm) { + /* Don't signal the calling VCPU */ + if (c_vcpu == vcpu) + continue; + + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); + } + + return; + } + mpidr = SGI_AFFINITY_LEVEL(reg, 3); mpidr |= SGI_AFFINITY_LEVEL(reg, 2); mpidr |= SGI_AFFINITY_LEVEL(reg, 1); + target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg); /* * We iterate over all VCPUs to find the MPIDRs matching the request. @@ -1114,54 +1157,19 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) * VCPUs when most of the times we just signal a single VCPU. 
*/ kvm_for_each_vcpu(c, c_vcpu, kvm) { - struct vgic_irq *irq; + int level0; /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) + if (target_cpus == 0) break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) + level0 = match_mpidr(mpidr, target_cpus, c_vcpu); + if (level0 == -1) continue; - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } + /* remove this matching VCPU from the mask */ + target_cpus &= ~BIT(level0); - irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - - raw_spin_lock_irqsave(&irq->irq_lock, flags); - - /* - * An access targeting Group0 SGIs can only generate - * those, while an access targeting Group1 SGIs can - * generate interrupts of either group. - */ - if (!irq->group || allow_group1) { - if (!irq->hw) { - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); - } else { - /* HW SGI? Ask the GIC to inject it */ - int err; - err = irq_set_irqchip_state(irq->host_irq, - IRQCHIP_STATE_PENDING, - true); - WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - } else { - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - } - - vgic_put_irq(vcpu->kvm, irq); + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } } -- Gitee From aca8dd2bc93cc8031df1262a0b4d8632c54efdae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:05 +0100 Subject: [PATCH 26/59] KVM: arm64: vgic: Use vcpu_idx for the debug information commit ac0fe56d46c0d534dddcf97d132c222722b9b7a5 upstream When dumping the debug information, use vcpu_idx instead of vcpu_id, as this is independent of any userspace influence. 
Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-6-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/vgic/vgic-debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 07aa0437125a..85606a531dc3 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -166,7 +166,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq, if (vcpu) { hdr = "VCPU"; - id = vcpu->vcpu_id; + id = vcpu->vcpu_idx; } seq_printf(s, "\n"); @@ -212,7 +212,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, " %2d " "\n", type, irq->intid, - (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1, + (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1, pending, irq->line_level, irq->active, @@ -224,7 +224,7 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, irq->mpidr, irq->source, irq->priority, - (irq->vcpu) ? irq->vcpu->vcpu_id : -1); + (irq->vcpu) ? irq->vcpu->vcpu_idx : -1); } static int vgic_debug_show(struct seq_file *s, void *v) -- Gitee From a2aba1b02ec1233dada4b5809423a588b963ee08 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:06 +0100 Subject: [PATCH 27/59] KVM: arm64: Use vcpu_idx for invalidation tracking commit 5f4bd815ec718992c09de1f39479ae90dcbb7df3 upstream While vcpu_id isn't necessarily a bad choice as an identifier for the currently running vcpu, it is provided by userspace, and there is close to no guarantee that it would be unique. Switch it to vcpu_idx instead, for which we have much stronger guarantees. 
Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-7-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/arm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 27344ae89d73..b832c637eb7a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -579,9 +579,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ - if (*last_ran != vcpu->vcpu_id) { + if (*last_ran != vcpu->vcpu_idx) { kvm_call_hyp(__kvm_flush_cpu_context, mmu); - *last_ran = vcpu->vcpu_id; + *last_ran = vcpu->vcpu_idx; } vcpu->cpu = cpu; -- Gitee From 4008f0de7be30f6ae872e2e3a7004afece18e41d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:07 +0100 Subject: [PATCH 28/59] KVM: arm64: Simplify kvm_vcpu_get_mpidr_aff() commit 0a2acd38d23b8865b3a5a8c851001350df76fc09 upstream By definition, MPIDR_EL1 cannot be modified by the guest. This means it is pointless to check whether this is loaded on the CPU. Simplify the kvm_vcpu_get_mpidr_aff() helper to directly access the in-memory value. 
Reviewed-by: Joey Gouly Reviewed-by: Zenghui Yu Tested-by: Joey Gouly Tested-by: Shameer Kolothum Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-8-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/include/asm/kvm_emulate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 7e94f8eebe09..67b1dae0d63a 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -486,7 +486,7 @@ static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { - return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; + return __vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) -- Gitee From a25de32475c55d046d0931d54ecdb0b17a54e616 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:08 +0100 Subject: [PATCH 29/59] KVM: arm64: Build MPIDR to vcpu index cache at runtime commit 5544750efd51143fee9e9184d69518c0c05426a1 upstream The MPIDR_EL1 register contains a unique value that identifies the CPU. The only problem with it is that it is stupidly large (32 bits, once the useless stuff is removed). Trying to obtain a vcpu from an MPIDR value is a fairly common, yet costly operation: we iterate over all the vcpus until we find the correct one. While this is cheap for small VMs, it is pretty expensive on large ones, specially if you are trying to get to the one that's at the end of the list... In order to help with this, it is important to realise that the MPIDR values are actually structured, and that implementations tend to use a small number of significant bits in the 32bit space. 
We can use this fact to our advantage by computing a small hash table that uses the "compression" of the significant MPIDR bits as an index, giving us the vcpu index as a result. Given that the MPIDR values can be supplied by userspace, and that an evil VMM could decide to make *all* bits significant, resulting in a 4G-entry table, we only use this method if the resulting table fits in a single page. Otherwise, we fallback to the good old iterative method. Nothing uses that table just yet, but keep your eyes peeled. Reviewed-by: Joey Gouly Reviewed-by: Zenghui Yu Tested-by: Joey Gouly Tested-by: Shameer Kolothum Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-9-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/include/asm/kvm_host.h | 28 ++++++++++++++++ arch/arm64/kvm/arm.c | 54 +++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c01d74a70d34..ce9908fe0acf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -202,6 +202,31 @@ struct kvm_protected_vm { struct kvm_hyp_memcache teardown_mc; }; +struct kvm_mpidr_data { + u64 mpidr_mask; + DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx); +}; + +static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr) +{ + unsigned long mask = data->mpidr_mask; + u64 aff = mpidr & MPIDR_HWID_BITMASK; + int nbits, bit, bit_idx = 0; + u16 index = 0; + + /* + * If this looks like RISC-V's BEXT or x86's PEXT + * instructions, it isn't by accident. 
+ */ + nbits = fls(mask); + for_each_set_bit(bit, &mask, nbits) { + index |= (aff & BIT(bit)) >> (bit - bit_idx); + bit_idx++; + } + + return index; +} + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -248,6 +273,9 @@ struct kvm_arch { /* VM-wide vCPU feature set */ DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES); + /* MPIDR to vcpu index mapping, optional */ + struct kvm_mpidr_data *mpidr_data; + /* * VM-wide PMU filter, implemented as a bitmap and big enough for * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b832c637eb7a..c5e4358158c5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -329,6 +329,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); + kfree(kvm->arch.mpidr_data); kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); @@ -730,6 +731,57 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) return vcpu_get_flag(vcpu, VCPU_INITIALIZED); } +static void kvm_init_mpidr_data(struct kvm *kvm) +{ + struct kvm_mpidr_data *data = NULL; + unsigned long c, mask, nr_entries; + u64 aff_set = 0, aff_clr = ~0UL; + struct kvm_vcpu *vcpu; + + mutex_lock(&kvm->arch.config_lock); + + if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1) + goto out; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + aff_set |= aff; + aff_clr &= aff; + } + + /* + * A significant bit can be either 0 or 1, and will only appear in + * aff_set. Use aff_clr to weed out the useless stuff. + */ + mask = aff_set ^ aff_clr; + nr_entries = BIT_ULL(hweight_long(mask)); + + /* + * Don't let userspace fool us. If we need more than a single page + * to describe the compressed MPIDR array, just fall back to the + * iterative method. Single vcpu VMs do not need this either. 
+ */ + if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE) + data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries), + GFP_KERNEL_ACCOUNT); + + if (!data) + goto out; + + data->mpidr_mask = mask; + + kvm_for_each_vcpu(c, vcpu, kvm) { + u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); + u16 index = kvm_mpidr_index(data, aff); + + data->cmpidr_to_idx[index] = c; + } + + kvm->arch.mpidr_data = data; +out: + mutex_unlock(&kvm->arch.config_lock); +} + /* * Handle both the initialisation that is being done when the vcpu is * run for the first time, as well as the updates that must be @@ -753,6 +805,8 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (likely(vcpu_has_run_once(vcpu))) return 0; + kvm_init_mpidr_data(kvm); + kvm_arm_vcpu_init_debug(vcpu); if (likely(irqchip_in_kernel(kvm))) { -- Gitee From 146e5a1d936fe7411faa6bee163eec9a07ccf10e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:09 +0100 Subject: [PATCH 30/59] KVM: arm64: Fast-track kvm_mpidr_to_vcpu() when mpidr_data is available commit 54a8006d0b49044d0cb682119686a45de906fe3c upstream If our fancy little table is present when calling kvm_mpidr_to_vcpu(), use it to recover the corresponding vcpu. 
Reviewed-by: Joey Gouly Reviewed-by: Zenghui Yu Tested-by: Joey Gouly Tested-by: Shameer Kolothum Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-10-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/arm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c5e4358158c5..b58a7c46d993 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2661,6 +2661,18 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) unsigned long i; mpidr &= MPIDR_HWID_BITMASK; + + if (kvm->arch.mpidr_data) { + u16 idx = kvm_mpidr_index(kvm->arch.mpidr_data, mpidr); + + vcpu = kvm_get_vcpu(kvm, + kvm->arch.mpidr_data->cmpidr_to_idx[idx]); + if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) + vcpu = NULL; + + return vcpu; + } + kvm_for_each_vcpu(i, vcpu, kvm) { if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) return vcpu; -- Gitee From d50ca9a6dba69f332dcae156899ae5966427f950 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:10 +0100 Subject: [PATCH 31/59] KVM: arm64: vgic-v3: Optimize affinity-based SGI injection commit b5daffb120bb60f974ae1a5589160b05c98e00e5 upstream Our affinity-based SGI injection code is a bit daft. We iterate over all the CPUs trying to match the set of affinities that the guest is trying to reach, leading to some very bad behaviours if the selected targets are at a high vcpu index. Instead, we can now use the fact that we have an optimised MPIDR to vcpu mapping, and only look at the relevant values. This results in a much faster injection for large VMs, and in a near constant time, irrespective of the position in the vcpu index space. As a bonus, this is mostly deleting a lot of hard-to-read code. Nobody will complain about that. 
Suggested-by: Xu Zhao Tested-by: Joey Gouly Tested-by: Shameer Kolothum Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-11-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 64 +++++------------------------- 1 file changed, 11 insertions(+), 53 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 31e0b5aa2678..cea9745fa2c5 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -1036,35 +1036,6 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) return 0; } -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} /* * The ICC_SGI* registers encode the affinity differently from the MPIDR, @@ -1117,9 +1088,11 @@ static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1) * This will trap in sys_regs.c and call this function. * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. 
If this bit is set, we signal all, but not the - * calling VCPU. + * + * If the interrupt routing mode bit is not set, we iterate over the Aff0 + * bits and signal the VCPUs matching the provided Aff{3,2,1}. + * + * If this bit is set, we signal all, but not the calling VCPU. */ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) { @@ -1127,7 +1100,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) struct kvm_vcpu *c_vcpu; unsigned long target_cpus; u64 mpidr; - u32 sgi; + u32 sgi, aff0; unsigned long c; sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg); @@ -1145,31 +1118,16 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) return; } + /* We iterate over affinities to find the corresponding vcpus */ mpidr = SGI_AFFINITY_LEVEL(reg, 3); mpidr |= SGI_AFFINITY_LEVEL(reg, 2); mpidr |= SGI_AFFINITY_LEVEL(reg, 1); target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg); - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear its bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. 
- */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - int level0; - - /* Exit early if we have dealt with all requested CPUs */ - if (target_cpus == 0) - break; - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - - vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); + for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) { + c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0); + if (c_vcpu) + vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1); } } -- Gitee From 79609d786a8dbeb048f80d0358adff53f82f86e4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Sep 2023 10:09:11 +0100 Subject: [PATCH 32/59] KVM: arm64: Clarify the ordering requirements for vcpu/RD creation commit f9940416f193d148dd7ad7498ce4f40c83d12e7a upstream It goes without saying, but it is probably better to spell it out: If userspace tries to restore and VM, but creates vcpus and/or RDs in a different order, the vcpu/RD mapping will be different. Yes, our API is an ugly piece of crap and I can't believe that we missed this. If we want to relax the above, we'll need to define a new userspace API that allows the mapping to be specified, rather than relying on the kernel to perform the mapping on its own. Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230927090911.3355209-12-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Jun He --- Documentation/virt/kvm/devices/arm-vgic-v3.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst index 51e5e5762571..5817edb4e046 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst @@ -59,6 +59,13 @@ Groups: It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. 
+ Note that to obtain reproducible results (the same VCPU being associated + with the same redistributor across a save/restore operation), VCPU creation + order, redistributor region creation order as well as the respective + interleaves of VCPU and region creation MUST be preserved. Any change in + either ordering may result in a different vcpu_id/redistributor association, + resulting in a VM that will fail to run at restore time. + Errors: ======= ============================================================= -- Gitee From 701de0a6244c9b87b8306446842f9a40a99684f5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:01 +0100 Subject: [PATCH 33/59] crypto: arm64/crct10dif - Remove obsolete chunking logic commit 7048c21e6b50e4dec0de1ed48b12db50b94b3f57 upstream This is a partial revert of commit fc754c024a343b, which moved the logic into C code which ensures that kernel mode NEON code does not hog the CPU for too long. This is no longer needed now that kernel mode NEON no longer disables preemption, so we can drop this. 
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm64/crypto/crct10dif-ce-glue.c | 30 ++++++--------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 09eb1456aed4..ccc3f6067742 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -37,18 +37,9 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p8(*crc, data, length); + kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); } @@ -62,18 +53,9 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p64(*crc, data, length); + kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); } -- Gitee From a642dc4f0893efd363184b42c9675d7140e76ec1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:02 +0100 Subject: [PATCH 34/59] crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply commit 67dfb1b73f423622a0096ea43fb1f5b7336f49e0 upstream The CRC-T10DIF implementation for arm64 has 
a version that uses 8x8 polynomial multiplication, for cores that lack the crypto extensions, which cover the 64x64 polynomial multiplication instruction that the algorithm was built around. This fallback version rather naively adopted the 64x64 polynomial multiplication algorithm that I ported from ARM for the GHASH driver, which needs 8 PMULL8 instructions to implement one PMULL64. This is reasonable, given that each 8-bit vector element needs to be multiplied with each element in the other vector, producing 8 vectors with partial results that need to be combined to yield the correct result. However, most PMULL64 invocations in the CRC-T10DIF code involve multiplication by a pair of 16-bit folding coefficients, and so all the partial results from higher order bytes will be zero, and there is no need to calculate them to begin with. Then, the CRC-T10DIF algorithm always XORs the output values of the PMULL64 instructions being issued in pairs, and so there is no need to faithfully implement each individual PMULL64 instruction, as long as XORing the results pairwise produces the expected result. 
Implementing these improvements results in a speedup of 3.3x on low-end platforms such as Raspberry Pi 4 (Cortex-A72) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm64/crypto/crct10dif-ce-core.S | 121 ++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 17 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 5604de61d06d..d2acaa2b5a01 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -1,8 +1,11 @@ // // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions // -// Copyright (C) 2016 Linaro Ltd -// Copyright (C) 2019 Google LLC +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019-2024 Google LLC +// +// Authors: Ard Biesheuvel +// Eric Biggers // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License version 2 as @@ -122,6 +125,13 @@ sli perm2.2d, perm1.2d, #56 sli perm3.2d, perm1.2d, #48 sli perm4.2d, perm1.2d, #40 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi bd1.4h, #8, lsl #8 + orr bd1.2s, #1, lsl #16 + orr bd1.2s, #1, lsl #24 + zip1 bd1.16b, bd1.16b, bd1.16b + zip1 bd1.16b, bd1.16b, bd1.16b .endm .macro __pmull_pre_p8, bd @@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core) ret SYM_FUNC_END(__pmull_p8_core) + .macro pmull16x64_p64, a16, b64, c64 + pmull2 \c64\().1q, \a16\().2d, \b64\().2d + pmull \b64\().1q, \a16\().1d, \b64\().1d + .endm + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. 
+ * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR (*) can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and combined to produce two + * 80-bit results. + * + * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent + * to the 64x64 bit one above, but XOR'ing the outputs together will + * produce the expected result, and this is sufficient in the context of + * this algorithm. 
+ */ + .macro pmull16x64_p8, a16, b64, c64 + ext t7.16b, \b64\().16b, \b64\().16b, #1 + tbl t5.16b, {\a16\().16b}, bd1.16b + uzp1 t7.16b, \b64\().16b, t7.16b + bl __pmull_p8_16x64 + ext \b64\().16b, t4.16b, t4.16b, #15 + eor \c64\().16b, t8.16b, t5.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_16x64) + ext t6.16b, t5.16b, t5.16b, #8 + + pmull t3.8h, t7.8b, t5.8b + pmull t4.8h, t7.8b, t6.8b + pmull2 t5.8h, t7.16b, t5.16b + pmull2 t6.8h, t7.16b, t6.16b + + ext t8.16b, t3.16b, t3.16b, #8 + eor t4.16b, t4.16b, t6.16b + ext t7.16b, t5.16b, t5.16b, #8 + ext t6.16b, t4.16b, t4.16b, #8 + eor t8.8b, t8.8b, t3.8b + eor t5.8b, t5.8b, t7.8b + eor t4.8b, t4.8b, t6.8b + ext t5.16b, t5.16b, t5.16b, #14 + ret +SYM_FUNC_END(__pmull_p8_16x64) + .macro __pmull_p8, rq, ad, bd, i .ifnc \bd, fold_consts .err @@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core) .macro fold_32_bytes, p, reg1, reg2 ldp q11, q12, [buf], #0x20 - __pmull_\p v8, \reg1, fold_consts, 2 - __pmull_\p \reg1, \reg1, fold_consts + pmull16x64_\p fold_consts, \reg1, v8 CPU_LE( rev64 v11.16b, v11.16b ) CPU_LE( rev64 v12.16b, v12.16b ) - __pmull_\p v9, \reg2, fold_consts, 2 - __pmull_\p \reg2, \reg2, fold_consts + pmull16x64_\p fold_consts, \reg2, v9 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) @@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) // Fold src_reg into dst_reg, optionally loading the next fold constants .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - __pmull_\p v8, \src_reg, fold_consts - __pmull_\p \src_reg, \src_reg, fold_consts, 2 + pmull16x64_\p fold_consts, \src_reg, v8 .ifnb \load_next_consts ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts .endif eor \dst_reg\().16b, \dst_reg\().16b, v8.16b eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b @@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Load the constants for folding across 128 bytes. 
ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts // Subtract 128 for the 128 data bytes just consumed. Subtract another // 128 to simplify the termination condition of the following loop. @@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Fold across 64 bytes. add fold_consts_ptr, fold_consts_ptr, #16 ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts fold_16_bytes \p, v0, v4 fold_16_bytes \p, v1, v5 fold_16_bytes \p, v2, v6 @@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // into them, storing the result back into v7. b.lt .Lfold_16_bytes_loop_done_\@ .Lfold_16_bytes_loop_\@: - __pmull_\p v8, v7, fold_consts - __pmull_\p v7, v7, fold_consts, 2 + pmull16x64_\p fold_consts, v7, v8 eor v7.16b, v7.16b, v8.16b ldr q0, [buf], #16 CPU_LE( rev64 v0.16b, v0.16b ) @@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) bsl v2.16b, v1.16b, v0.16b // Fold the first chunk into the second chunk, storing the result in v7. - __pmull_\p v0, v3, fold_consts - __pmull_\p v7, v3, fold_consts, 2 - eor v7.16b, v7.16b, v0.16b + pmull16x64_\p fold_consts, v3, v0 + eor v7.16b, v3.16b, v0.16b eor v7.16b, v7.16b, v2.16b .Lreduce_final_16_bytes_\@: @@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Load the fold-across-16-bytes constants. ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts cmp len, #16 b.eq .Lreduce_final_16_bytes_\@ // len == 16 -- Gitee From 54cf4e8eb157d061844201bbb400cd5c5c0c22e3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:03 +0100 Subject: [PATCH 35/59] crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code commit 779cee8209c67aae195a81c3a72bac9e127fdaee upstream The only remaining user of the fallback implementation of 64x64 polynomial multiplication using 8x8 PMULL instructions is the final reduction from a 16 byte vector to a 16-bit CRC. 
The fallback code is complicated and messy, and this reduction has little impact on the overall performance, so instead, let's calculate the final CRC by passing the 16 byte vector to the generic CRC-T10DIF implementation when running the fallback version. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm64/crypto/crct10dif-ce-core.S | 244 ++++++-------------------- arch/arm64/crypto/crct10dif-ce-glue.c | 18 +- 2 files changed, 68 insertions(+), 194 deletions(-) diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index d2acaa2b5a01..87dd6d46224d 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -74,137 +74,18 @@ init_crc .req w0 buf .req x1 len .req x2 - fold_consts_ptr .req x3 + fold_consts_ptr .req x5 fold_consts .req v10 - ad .req v14 - - k00_16 .req v15 - k32_48 .req v16 - t3 .req v17 t4 .req v18 t5 .req v19 t6 .req v20 t7 .req v21 t8 .req v22 - t9 .req v23 - - perm1 .req v24 - perm2 .req v25 - perm3 .req v26 - perm4 .req v27 - - bd1 .req v28 - bd2 .req v29 - bd3 .req v30 - bd4 .req v31 - - .macro __pmull_init_p64 - .endm - .macro __pmull_pre_p64, bd - .endm - - .macro __pmull_init_p8 - // k00_16 := 0x0000000000000000_000000000000ffff - // k32_48 := 0x00000000ffffffff_0000ffffffffffff - movi k32_48.2d, #0xffffffff - mov k32_48.h[2], k32_48.h[0] - ushr k00_16.2d, k32_48.2d, #32 - - // prepare the permutation vectors - mov_q x5, 0x080f0e0d0c0b0a09 - movi perm4.8b, #8 - dup perm1.2d, x5 - eor perm1.16b, perm1.16b, perm4.16b - ushr perm2.2d, perm1.2d, #8 - ushr perm3.2d, perm1.2d, #16 - ushr perm4.2d, perm1.2d, #24 - sli perm2.2d, perm1.2d, #56 - sli perm3.2d, perm1.2d, #48 - sli perm4.2d, perm1.2d, #40 - - // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } - movi bd1.4h, #8, lsl #8 - orr bd1.2s, #1, lsl #16 - orr bd1.2s, #1, lsl #24 - zip1 bd1.16b, bd1.16b, bd1.16b - zip1 bd1.16b, bd1.16b, bd1.16b - .endm - - .macro __pmull_pre_p8, bd - 
tbl bd1.16b, {\bd\().16b}, perm1.16b - tbl bd2.16b, {\bd\().16b}, perm2.16b - tbl bd3.16b, {\bd\().16b}, perm3.16b - tbl bd4.16b, {\bd\().16b}, perm4.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_core) -.L__pmull_p8_core: - ext t4.8b, ad.8b, ad.8b, #1 // A1 - ext t5.8b, ad.8b, ad.8b, #2 // A2 - ext t6.8b, ad.8b, ad.8b, #3 // A3 - - pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B - pmull t8.8h, ad.8b, bd1.8b // E = A*B1 - pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B - pmull t7.8h, ad.8b, bd2.8b // G = A*B2 - pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B - pmull t9.8h, ad.8b, bd3.8b // I = A*B3 - pmull t3.8h, ad.8b, bd4.8b // K = A*B4 - b 0f - -.L__pmull_p8_core2: - tbl t4.16b, {ad.16b}, perm1.16b // A1 - tbl t5.16b, {ad.16b}, perm2.16b // A2 - tbl t6.16b, {ad.16b}, perm3.16b // A3 - - pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B - pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 - pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B - pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 - pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B - pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 - pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 - -0: eor t4.16b, t4.16b, t8.16b // L = E + F - eor t5.16b, t5.16b, t7.16b // M = G + H - eor t6.16b, t6.16b, t9.16b // N = I + J - - uzp1 t8.2d, t4.2d, t5.2d - uzp2 t4.2d, t4.2d, t5.2d - uzp1 t7.2d, t6.2d, t3.2d - uzp2 t6.2d, t6.2d, t3.2d - - // t4 = (L) (P0 + P1) << 8 - // t5 = (M) (P2 + P3) << 16 - eor t8.16b, t8.16b, t4.16b - and t4.16b, t4.16b, k32_48.16b - - // t6 = (N) (P4 + P5) << 24 - // t7 = (K) (P6 + P7) << 32 - eor t7.16b, t7.16b, t6.16b - and t6.16b, t6.16b, k00_16.16b - - eor t8.16b, t8.16b, t4.16b - eor t7.16b, t7.16b, t6.16b - - zip2 t5.2d, t8.2d, t4.2d - zip1 t4.2d, t8.2d, t4.2d - zip2 t3.2d, t7.2d, t6.2d - zip1 t6.2d, t7.2d, t6.2d - - ext t4.16b, t4.16b, t4.16b, #15 - ext t5.16b, t5.16b, t5.16b, #14 - ext t6.16b, t6.16b, t6.16b, #13 - ext t3.16b, t3.16b, t3.16b, #12 - - eor t4.16b, t4.16b, t5.16b - eor t6.16b, t6.16b, t3.16b - ret 
-SYM_FUNC_END(__pmull_p8_core) + perm .req v27 .macro pmull16x64_p64, a16, b64, c64 pmull2 \c64\().1q, \a16\().2d, \b64\().2d @@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core) */ .macro pmull16x64_p8, a16, b64, c64 ext t7.16b, \b64\().16b, \b64\().16b, #1 - tbl t5.16b, {\a16\().16b}, bd1.16b + tbl t5.16b, {\a16\().16b}, perm.16b uzp1 t7.16b, \b64\().16b, t7.16b bl __pmull_p8_16x64 ext \b64\().16b, t4.16b, t4.16b, #15 @@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64) ret SYM_FUNC_END(__pmull_p8_16x64) - .macro __pmull_p8, rq, ad, bd, i - .ifnc \bd, fold_consts - .err - .endif - mov ad.16b, \ad\().16b - .ifb \i - pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B - .else - pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B - .endif - - bl .L__pmull_p8_core\i - - eor \rq\().16b, \rq\().16b, t4.16b - eor \rq\().16b, \rq\().16b, t6.16b - .endm // Fold reg1, reg2 into the next 32 data bytes, storing the result back // into reg1, reg2. @@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b .endm - .macro __pmull_p64, rd, rn, rm, n - .ifb \n - pmull \rd\().1q, \rn\().1d, \rm\().1d - .else - pmull2 \rd\().1q, \rn\().2d, \rm\().2d - .endif - .endm - .macro crc_t10dif_pmull, p - __pmull_init_\p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. cmp len, #256 @@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) pmull16x64_\p fold_consts, v3, v0 eor v7.16b, v3.16b, v0.16b eor v7.16b, v7.16b, v2.16b - -.Lreduce_final_16_bytes_\@: - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. 
- ext v0.16b, v2.16b, v7.16b, #8 - __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. - ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Use Barrett reduction to compute the final CRC value. - __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - __pmull_\p v1, v1, fold_consts // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - .ifc \p, p8 - frame_pop - .endif - ret + b .Lreduce_final_16_bytes_\@ .Lless_than_256_bytes_\@: // Checksumming a buffer of length 16...255 bytes @@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 add len, len, #16 b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + +.Lreduce_final_16_bytes_\@: .endm // @@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // SYM_FUNC_START(crc_t10dif_pmull_p8) frame_push 1 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi perm.4h, #8, lsl #8 + orr perm.2s, #1, lsl #16 + orr perm.2s, #1, lsl #24 + zip1 perm.16b, perm.16b, perm.16b + zip1 perm.16b, perm.16b, perm.16b + crc_t10dif_pmull p8 + +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + str q7, [x3] + + frame_pop + ret SYM_FUNC_END(crc_t10dif_pmull_p8) .align 5 @@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8) // SYM_FUNC_START(crc_t10dif_pmull_p64) crc_t10dif_pmull p64 + + // 
Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Use Barrett reduction to compute the final CRC value. + pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. 
+ + umov w0, v0.h[0] + ret SYM_FUNC_END(crc_t10dif_pmull_p64) .section ".rodata", "a" diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index ccc3f6067742..d9bcdd00c975 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -20,7 +20,8 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U -asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); static int crct10dif_init(struct shash_desc *desc) @@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc) static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, unsigned int length) { - u16 *crc = shash_desc_ctx(desc); + u16 *crcp = shash_desc_ctx(desc); + u16 crc = *crcp; + u8 buf[16]; - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, length); + crc_t10dif_pmull_p8(crc, data, length, buf); kernel_neon_end(); - } else { - *crc = crc_t10dif_generic(*crc, data, length); + + crc = 0; + data = buf; + length = sizeof(buf); } + *crcp = crc_t10dif_generic(crc, data, length); return 0; } -- Gitee From 6793ad8d9b2b97703a91e5bbd7ca2497bcb1999f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:04 +0100 Subject: [PATCH 36/59] crypto: arm/crct10dif - Use existing mov_l macro instead of __adrl commit fcf27785ae51b259ea2a9b340f10f9d393954887 upstream Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm/crypto/crct10dif-ce-core.S | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 
46c02c518a30..4dac32e020de 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -144,11 +144,6 @@ CPU_LE( vrev64.8 q12, q12 ) veor.8 \dst_reg, \dst_reg, \src_reg .endm - .macro __adrl, out, sym - movw \out, #:lower16:\sym - movt \out, #:upper16:\sym - .endm - // // u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); // @@ -160,7 +155,7 @@ ENTRY(crc_t10dif_pmull) cmp len, #256 blt .Lless_than_256_bytes - __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts + mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts // Load the first 128 data bytes. Byte swapping is necessary to make // the bit order match the polynomial coefficient order. @@ -262,7 +257,7 @@ CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. - __adrl r3, .Lbyteshift_table + 16 + mov_l r3, .Lbyteshift_table + 16 sub r3, r3, len vld1.8 {q2}, [r3] vtbl.8 q1l, {q7l-q7h}, q2l @@ -324,7 +319,7 @@ CPU_LE( vrev64.8 q0, q0 ) .Lless_than_256_bytes: // Checksumming a buffer of length 16...255 bytes - __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts // Load the first 16 data bytes. vld1.64 {q7}, [buf]! -- Gitee From ae4eb668a483a77fb315aff0a878fe37c2c330d3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:05 +0100 Subject: [PATCH 37/59] crypto: arm/crct10dif - Macroify PMULL asm code commit 802d8d110ce2b3ae979221551f4cb168e2f5e464 upstream To allow an alternative version to be created of the PMULL based CRC-T10DIF algorithm, turn the bulk of it into a macro, except for the final reduction, which will only be used by the existing version. 
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm/crypto/crct10dif-ce-core.S | 154 ++++++++++++++-------------- arch/arm/crypto/crct10dif-ce-glue.c | 10 +- 2 files changed, 83 insertions(+), 81 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 4dac32e020de..6b72167574b2 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -112,48 +112,42 @@ FOLD_CONST_L .req q10l FOLD_CONST_H .req q10h + .macro pmull16x64_p64, v16, v64 + vmull.p64 q11, \v64\()l, \v16\()_L + vmull.p64 \v64, \v64\()h, \v16\()_H + veor \v64, \v64, q11 + .endm + // Fold reg1, reg2 into the next 32 data bytes, storing the result back // into reg1, reg2. - .macro fold_32_bytes, reg1, reg2 - vld1.64 {q11-q12}, [buf]! + .macro fold_32_bytes, reg1, reg2, p + vld1.64 {q8-q9}, [buf]! - vmull.p64 q8, \reg1\()h, FOLD_CONST_H - vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L - vmull.p64 q9, \reg2\()h, FOLD_CONST_H - vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L + pmull16x64_\p FOLD_CONST, \reg1 + pmull16x64_\p FOLD_CONST, \reg2 -CPU_LE( vrev64.8 q11, q11 ) -CPU_LE( vrev64.8 q12, q12 ) - vswp q11l, q11h - vswp q12l, q12h +CPU_LE( vrev64.8 q8, q8 ) +CPU_LE( vrev64.8 q9, q9 ) + vswp q8l, q8h + vswp q9l, q9h veor.8 \reg1, \reg1, q8 veor.8 \reg2, \reg2, q9 - veor.8 \reg1, \reg1, q11 - veor.8 \reg2, \reg2, q12 .endm // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, src_reg, dst_reg, load_next_consts - vmull.p64 q8, \src_reg\()l, FOLD_CONST_L - vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H + .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts + pmull16x64_\p FOLD_CONST, \src_reg .ifnb \load_next_consts vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! 
.endif - veor.8 \dst_reg, \dst_reg, q8 veor.8 \dst_reg, \dst_reg, \src_reg .endm -// -// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -ENTRY(crc_t10dif_pmull) - + .macro crct10dif, p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. cmp len, #256 - blt .Lless_than_256_bytes + blt .Lless_than_256_bytes\@ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts @@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 ) // While >= 128 data bytes remain (not counting q0-q7), fold the 128 // bytes q0-q7 into them, storing the result back into q0-q7. -.Lfold_128_bytes_loop: - fold_32_bytes q0, q1 - fold_32_bytes q2, q3 - fold_32_bytes q4, q5 - fold_32_bytes q6, q7 +.Lfold_128_bytes_loop\@: + fold_32_bytes q0, q1, \p + fold_32_bytes q2, q3, \p + fold_32_bytes q4, q5, \p + fold_32_bytes q6, q7, \p subs len, len, #128 - bge .Lfold_128_bytes_loop + bge .Lfold_128_bytes_loop\@ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. // Fold across 64 bytes. vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! - fold_16_bytes q0, q4 - fold_16_bytes q1, q5 - fold_16_bytes q2, q6 - fold_16_bytes q3, q7, 1 + fold_16_bytes q0, q4, \p + fold_16_bytes q1, q5, \p + fold_16_bytes q2, q6, \p + fold_16_bytes q3, q7, \p, 1 // Fold across 32 bytes. - fold_16_bytes q4, q6 - fold_16_bytes q5, q7, 1 + fold_16_bytes q4, q6, \p + fold_16_bytes q5, q7, \p, 1 // Fold across 16 bytes. - fold_16_bytes q6, q7 + fold_16_bytes q6, q7, \p // Add 128 to get the correct number of data bytes remaining in 0...127 // (not counting q7), following the previous extra subtraction by 128. @@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 ) // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 // into them, storing the result back into q7. 
- blt .Lfold_16_bytes_loop_done -.Lfold_16_bytes_loop: - vmull.p64 q8, q7l, FOLD_CONST_L - vmull.p64 q7, q7h, FOLD_CONST_H - veor.8 q7, q7, q8 + blt .Lfold_16_bytes_loop_done\@ +.Lfold_16_bytes_loop\@: + pmull16x64_\p FOLD_CONST, q7 vld1.64 {q0}, [buf]! CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h veor.8 q7, q7, q0 subs len, len, #16 - bge .Lfold_16_bytes_loop + bge .Lfold_16_bytes_loop\@ -.Lfold_16_bytes_loop_done: +.Lfold_16_bytes_loop_done\@: // Add 16 to get the correct number of data bytes remaining in 0...15 // (not counting q7), following the previous extra subtraction by 16. adds len, len, #16 - beq .Lreduce_final_16_bytes + beq .Lreduce_final_16_bytes\@ -.Lhandle_partial_segment: +.Lhandle_partial_segment\@: // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To // do this without needing a fold constant for each possible 'len', @@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 ) vbsl.8 q2, q1, q0 // Fold the first chunk into the second chunk, storing the result in q7. - vmull.p64 q0, q3l, FOLD_CONST_L - vmull.p64 q7, q3h, FOLD_CONST_H - veor.8 q7, q7, q0 - veor.8 q7, q7, q2 + pmull16x64_\p FOLD_CONST, q3 + veor.8 q7, q3, q2 + b .Lreduce_final_16_bytes\@ + +.Lless_than_256_bytes\@: + // Checksumming a buffer of length 16...255 bytes + + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + vld1.64 {q7}, [buf]! +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q0h, #0 + vmov.u16 q0h[3], init_crc + veor.8 q7h, q7h, q0h + + // Load the fold-across-16-bytes constants. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! 
+ + cmp len, #16 + beq .Lreduce_final_16_bytes\@ // len == 16 + subs len, len, #32 + addlt len, len, #16 + blt .Lhandle_partial_segment\@ // 17 <= len <= 31 + b .Lfold_16_bytes_loop\@ // 32 <= len <= 255 + +.Lreduce_final_16_bytes\@: + .endm + +// +// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +ENTRY(crc_t10dif_pmull64) + crct10dif p64 -.Lreduce_final_16_bytes: // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. @@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 ) vmov.u16 r0, q0l[0] bx lr -.Lless_than_256_bytes: - // Checksumming a buffer of length 16...255 bytes - - mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - vld1.64 {q7}, [buf]! -CPU_LE( vrev64.8 q7, q7 ) - vswp q7l, q7h - - // XOR the first 16 data *bits* with the initial CRC value. - vmov.i8 q0h, #0 - vmov.u16 q0h[3], init_crc - veor.8 q7h, q7h, q0h - - // Load the fold-across-16-bytes constants. - vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! 
- - cmp len, #16 - beq .Lreduce_final_16_bytes // len == 16 - subs len, len, #32 - addlt len, len, #16 - blt .Lhandle_partial_segment // 17 <= len <= 31 - b .Lfold_16_bytes_loop // 32 <= len <= 255 -ENDPROC(crc_t10dif_pmull) +ENDPROC(crc_t10dif_pmull64) .section ".rodata", "a" .align 4 diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index e9191a8c87b9..c9b95c48a1cf 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -19,7 +19,7 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U -asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); static int crct10dif_init(struct shash_desc *desc) { @@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc) return 0; } -static int crct10dif_update(struct shash_desc *desc, const u8 *data, - unsigned int length) +static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data, + unsigned int length) { u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); - *crc = crc_t10dif_pmull(*crc, data, length); + *crc = crc_t10dif_pmull64(*crc, data, length); kernel_neon_end(); } else { *crc = crc_t10dif_generic(*crc, data, length); @@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out) static struct shash_alg crc_t10dif_alg = { .digestsize = CRC_T10DIF_DIGEST_SIZE, .init = crct10dif_init, - .update = crct10dif_update, + .update = crct10dif_update_ce, .final = crct10dif_final, .descsize = CRC_T10DIF_DIGEST_SIZE, -- Gitee From c9485bfd6992403f46ce32699cb5bde4c3c11ced Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 17:09:06 +0100 Subject: [PATCH 38/59] crypto: arm/crct10dif - Implement plain NEON variant commit e7c1d1c9b2023decb855ec4c921a7d78abbf64eb upstream The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in the folding 
coefficients, which are also only 16 bits wide. This means that the polynomial multiplications involving these coefficients can be performed using 8-bit long polynomial multiplication (8x8 -> 16) in only a few steps, and this is an instruction that is part of the base NEON ISA, which is all most real ARMv7 cores implement. (The 64-bit PMULL instruction is part of the crypto extensions, which are only implemented by 64-bit cores) The final reduction is a bit more involved, but we can delegate that to the generic CRC-T10DIF implementation after folding the entire input into a 16 byte vector. This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit mode. On Cortex-A8 (BeagleBone White), the results are substantially better than that, but not sufficiently reproducible (with tcrypt) to quote a number here. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Jun He --- arch/arm/crypto/crct10dif-ce-core.S | 98 +++++++++++++++++++++++++++-- arch/arm/crypto/crct10dif-ce-glue.c | 45 +++++++++++-- 2 files changed, 134 insertions(+), 9 deletions(-) diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S index 6b72167574b2..2bbf2df9c1e2 100644 --- a/arch/arm/crypto/crct10dif-ce-core.S +++ b/arch/arm/crypto/crct10dif-ce-core.S @@ -112,6 +112,82 @@ FOLD_CONST_L .req q10l FOLD_CONST_H .req q10h + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. The resulting 80-bit vectors are XOR'ed together. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. 
+ * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and XORed together to produce the + * final 80-bit result. 
+ */ + .macro pmull16x64_p8, v16, v64 + vext.8 q11, \v64, \v64, #1 + vld1.64 {q12}, [r4, :128] + vuzp.8 q11, \v64 + vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24 + vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25 + bl __pmull16x64_p8 + veor \v64, q12, q14 + .endm + +__pmull16x64_p8: + vmull.p8 q13, d23, d24 + vmull.p8 q14, d23, d25 + vmull.p8 q15, d22, d24 + vmull.p8 q12, d22, d25 + + veor q14, q14, q15 + veor d24, d24, d25 + veor d26, d26, d27 + veor d28, d28, d29 + vmov.i32 d25, #0 + vmov.i32 d29, #0 + vext.8 q12, q12, q12, #14 + vext.8 q14, q14, q14, #15 + veor d24, d24, d26 + bx lr +ENDPROC(__pmull16x64_p8) + .macro pmull16x64_p64, v16, v64 vmull.p64 q11, \v64\()l, \v16\()_L vmull.p64 \v64, \v64\()h, \v16\()_H @@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 ) vswp q0l, q0h // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. - mov_l r3, .Lbyteshift_table + 16 - sub r3, r3, len - vld1.8 {q2}, [r3] + mov_l r1, .Lbyteshift_table + 16 + sub r1, r1, len + vld1.8 {q2}, [r1] vtbl.8 q1l, {q7l-q7h}, q2l vtbl.8 q1h, {q7l-q7h}, q2h @@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64) vmov.u16 r0, q0l[0] bx lr - ENDPROC(crc_t10dif_pmull64) +ENTRY(crc_t10dif_pmull8) + push {r4, lr} + mov_l r4, .L16x64perm + + crct10dif p8 + +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + vst1.64 {q7}, [r3, :128] + pop {r4, pc} +ENDPROC(crc_t10dif_pmull8) + .section ".rodata", "a" .align 4 @@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64) .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 + +.L16x64perm: + .quad 0x808080800000000, 0x909090901010101 diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index c9b95c48a1cf..c7425695ba53 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -20,6 +20,8 @@ #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void 
crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); static int crct10dif_init(struct shash_desc *desc) { @@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data, return 0; } +static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u16 *crcp = shash_desc_ctx(desc); + u8 buf[16] __aligned(16); + u16 crc = *crcp; + + if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + crc = 0; + data = buf; + length = sizeof(buf); + } + + *crcp = crc_t10dif_generic(crc, data, length); + return 0; +} + static int crct10dif_final(struct shash_desc *desc, u8 *out) { u16 *crc = shash_desc_ctx(desc); @@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out) return 0; } -static struct shash_alg crc_t10dif_alg = { +static struct shash_alg algs[] = {{ + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = crct10dif_init, + .update = crct10dif_update_neon, + .final = crct10dif_final, + .descsize = CRC_T10DIF_DIGEST_SIZE, + + .base.cra_name = "crct10dif", + .base.cra_driver_name = "crct10dif-arm-neon", + .base.cra_priority = 150, + .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { .digestsize = CRC_T10DIF_DIGEST_SIZE, .init = crct10dif_init, .update = crct10dif_update_ce, @@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = { .base.cra_priority = 200, .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, .base.cra_module = THIS_MODULE, -}; +}}; static int __init crc_t10dif_mod_init(void) { - if (!(elf_hwcap2 & HWCAP2_PMULL)) + if (!(elf_hwcap & HWCAP_NEON)) return -ENODEV; - return crypto_register_shash(&crc_t10dif_alg); + return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL)); } static void __exit crc_t10dif_mod_exit(void) { - crypto_unregister_shash(&crc_t10dif_alg); + crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 
& HWCAP2_PMULL)); } module_init(crc_t10dif_mod_init); -- Gitee From 4a76de9d2020bc98ac74f8df9bea3b3452c4f8e5 Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Fri, 21 Jun 2024 13:47:07 +0800 Subject: [PATCH 39/59] riscv: Optimize crc32 with Zbc extension commit a43fe27d650375cd9e5ea915c538f6f9eabd185e upstream As suggested by the B-ext spec, the Zbc (carry-less multiplication) instructions can be used to accelerate CRC calculations. Currently, the crc32 is the most widely used crc function inside kernel, so this patch focuses on the optimization of just the crc32 APIs. Compared with the current table-lookup based optimization, Zbc based optimization can also achieve large stride during CRC calculation loop, meantime, it avoids the memory access latency of the table-lookup based implementation and it reduces memory footprint. If Zbc feature is not supported in a runtime environment, then the table-lookup based implementation would serve as fallback via alternative mechanism. By inspecting the vmlinux built by gcc v12.2.0 with default optimization level (-O2), we can see below instruction count change for each 8-byte stride in the CRC32 loop: rv64: crc32_be (54->31), crc32_le (54->13), __crc32c_le (54->13) rv32: crc32_be (50->32), crc32_le (50->16), __crc32c_le (50->16) The compile target CPU is little endian, extra effort is needed for byte swapping for the crc32_be API, thus, the instruction count change is not as significant as that in the *_le cases. This patch is tested on QEMU VM with the kernel CRC32 selftest for both rv64 and rv32. Running the CRC32 selftest on a real hardware (SpacemiT K1) with Zbc extension shows 65% and 125% performance improvement respectively on crc32_test() and crc32c_test(). 
Signed-off-by: Xiao Wang Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240621054707.1847548-1-xiao.w.wang@intel.com Signed-off-by: Palmer Dabbelt Signed-off-by: Jun He --- arch/riscv/Kconfig | 23 ++++ arch/riscv/lib/Makefile | 1 + arch/riscv/lib/crc32.c | 294 ++++++++++++++++++++++++++++++++++++++++ include/linux/crc32.h | 3 + 4 files changed, 321 insertions(+) create mode 100644 arch/riscv/lib/crc32.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 1304992232ad..7137722167cd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -547,6 +547,29 @@ config RISCV_ISA_ZBB If you don't know what to do here, say Y. +config TOOLCHAIN_HAS_ZBC + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbc) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbc) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900 + depends on AS_HAS_OPTION_ARCH + +config RISCV_ISA_ZBC + bool "Zbc extension support for carry-less multiplication instructions" + depends on TOOLCHAIN_HAS_ZBC + depends on MMU + depends on RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the Zbc + extension (carry-less multiplication) and enable its usage. + + The Zbc extension could accelerate CRC (cyclic redundancy check) + calculations. + + If you don't know what to do here, say Y. 
+ config RISCV_ISA_ZICBOM bool "Zicbom extension support for non-coherent DMA operation" depends on MMU diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 26cb2502ecf8..183bf2097d57 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -9,5 +9,6 @@ lib-y += strncmp.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o +lib-$(CONFIG_RISCV_ISA_ZBC) += crc32.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/riscv/lib/crc32.c b/arch/riscv/lib/crc32.c new file mode 100644 index 000000000000..d7dc599af3ef --- /dev/null +++ b/arch/riscv/lib/crc32.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32 implementation with Zbc extension. + * + * Copyright (C) 2024 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for + * better understanding of how this math works. + * + * let "+" denotes polynomial add (XOR) + * let "-" denotes polynomial sub (XOR) + * let "*" denotes polynomial multiplication + * let "/" denotes polynomial floor division + * let "S" denotes source data, XLEN bit wide + * let "P" denotes CRC32 polynomial + * let "T" denotes 2^(XLEN+32) + * let "QT" denotes quotient of T/P, with the bit for 2^XLEN being implicit + * + * crc32(S, P) + * => S * (2^32) - S * (2^32) / P * P + * => lowest 32 bits of: S * (2^32) / P * P + * => lowest 32 bits of: S * (2^32) * (T / P) / T * P + * => lowest 32 bits of: S * (2^32) * quotient / T * P + * => lowest 32 bits of: S * quotient / 2^XLEN * P + * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P + * => clmul_low_part(clmul_high_part(S, QT) + S, P) + * + * In terms of below implementations, the BE case is more intuitive, since the + * higher order bit sits at more significant position. 
+ */ + +#if __riscv_xlen == 64 +/* Slide by XLEN bits per iteration */ +# define STEP_ORDER 3 + +/* Each below polynomial quotient has an implicit bit for 2^XLEN */ + +/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */ +# define CRC32_POLY_QT_LE 0x5a72d812fb808b20 + +/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */ +# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8 + +/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format, it should be + * the same as the bit-reversed version of CRC32_POLY_QT_LE + */ +# define CRC32_POLY_QT_BE 0x04d101df481b4e5a + +static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr) +{ + return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr); +} + +static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt) +{ + u32 crc; + + /* We don't have a "clmulrh" insn, so use clmul + slli instead. */ + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + "slli %0, %0, 1\n" + "xor %0, %0, %1\n" + "clmulr %0, %0, %3\n" + "srli %0, %0, 32\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (poly_qt), + "r" ((u64)poly << 32) + :); + return crc; +} + +static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr) +{ + return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr); +} + +#elif __riscv_xlen == 32 +# define STEP_ORDER 2 +/* Each quotient should match the upper half of its analog in RV64 */ +# define CRC32_POLY_QT_LE 0xfb808b20 +# define CRC32C_POLY_QT_LE 0x6f5389f8 +# define CRC32_POLY_QT_BE 0x04d101df + +static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr) +{ + return crc ^ (__force u32)__cpu_to_le32(*ptr); +} + +static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt) +{ + u32 crc; + + /* We don't have a "clmulrh" insn, so use clmul + slli instead. 
*/ + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + "slli %0, %0, 1\n" + "xor %0, %0, %1\n" + "clmulr %0, %0, %3\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (poly_qt), + "r" (poly) + :); + return crc; +} + +static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr) +{ + return crc ^ (__force u32)__cpu_to_be32(*ptr); +} + +#else +# error "Unexpected __riscv_xlen" +#endif + +static inline u32 crc32_be_zbc(unsigned long s) +{ + u32 crc; + + asm volatile (".option push\n" + ".option arch,+zbc\n" + "clmulh %0, %1, %2\n" + "xor %0, %0, %1\n" + "clmul %0, %0, %3\n" + ".option pop\n" + : "=&r" (crc) + : "r" (s), + "r" (CRC32_POLY_QT_BE), + "r" (CRC32_POLY_BE) + :); + return crc; +} + +#define STEP (1 << STEP_ORDER) +#define OFFSET_MASK (STEP - 1) + +typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len); + +static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p, + size_t len, u32 poly, + unsigned long poly_qt) +{ + size_t bits = len * 8; + unsigned long s = 0; + u32 crc_low = 0; + + for (int i = 0; i < len; i++) + s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8); + + s ^= (unsigned long)crc << (__riscv_xlen - bits); + if (__riscv_xlen == 32 || len < sizeof(u32)) + crc_low = crc >> bits; + + crc = crc32_le_zbc(s, poly, poly_qt); + crc ^= crc_low; + + return crc; +} + +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, u32 poly, + unsigned long poly_qt, + fallback crc_fb) +{ + size_t offset, head_len, tail_len; + unsigned long const *p_ul; + unsigned long s; + + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBC, 1) + : : : : legacy); + + /* Handle the unaligned head. 
*/ + offset = (unsigned long)p & OFFSET_MASK; + if (offset && len) { + head_len = min(STEP - offset, len); + crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt); + p += head_len; + len -= head_len; + } + + tail_len = len & OFFSET_MASK; + len = len >> STEP_ORDER; + p_ul = (unsigned long const *)p; + + for (int i = 0; i < len; i++) { + s = crc32_le_prep(crc, p_ul); + crc = crc32_le_zbc(s, poly, poly_qt); + p_ul++; + } + + /* Handle the tail bytes. */ + p = (unsigned char const *)p_ul; + if (tail_len) + crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt); + + return crc; + +legacy: + return crc_fb(crc, p, len); +} + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE, + crc32_le_base); +} + +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32C_POLY_LE, + CRC32C_POLY_QT_LE, __crc32c_le_base); +} + +static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p, + size_t len) +{ + size_t bits = len * 8; + unsigned long s = 0; + u32 crc_low = 0; + + s = 0; + for (int i = 0; i < len; i++) + s = *p++ | (s << 8); + + if (__riscv_xlen == 32 || len < sizeof(u32)) { + s ^= crc >> (32 - bits); + crc_low = crc << bits; + } else { + s ^= (unsigned long)crc << (bits - 32); + } + + crc = crc32_be_zbc(s); + crc ^= crc_low; + + return crc; +} + +u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +{ + size_t offset, head_len, tail_len; + unsigned long const *p_ul; + unsigned long s; + + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBC, 1) + : : : : legacy); + + /* Handle the unaligned head. 
*/ + offset = (unsigned long)p & OFFSET_MASK; + if (offset && len) { + head_len = min(STEP - offset, len); + crc = crc32_be_unaligned(crc, p, head_len); + p += head_len; + len -= head_len; + } + + tail_len = len & OFFSET_MASK; + len = len >> STEP_ORDER; + p_ul = (unsigned long const *)p; + + for (int i = 0; i < len; i++) { + s = crc32_be_prep(crc, p_ul); + crc = crc32_be_zbc(s); + p_ul++; + } + + /* Handle the tail bytes. */ + p = (unsigned char const *)p_ul; + if (tail_len) + crc = crc32_be_unaligned(crc, p, tail_len); + + return crc; + +legacy: + return crc32_be_base(crc, p, len); +} diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 9e8a032c1788..87f788c0d607 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -9,7 +9,9 @@ #include u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); /** * crc32_le_combine - Combine two crc32 check values into one. For two @@ -37,6 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); +u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. For two -- Gitee From 518b9c5832ab27b7c4843eccb947937329d6a581 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:49 +0200 Subject: [PATCH 40/59] arm64/lib: Handle CRC-32 alternative in C code commit fc7454107d1b7c27bb98d3b109e5f44a8a46d7f8 upstream In preparation for adding another code path for performing CRC-32, move the alternative patching for ARM64_HAS_CRC32 into C code. The logic for deciding whether to use this new code path will be implemented in C too. 
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241018075347.2821102-6-ardb+git@google.com Signed-off-by: Catalin Marinas Signed-off-by: Jun He --- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/crc32-glue.c | 34 ++++++++++++++++++++++++++++++++++ arch/arm64/lib/crc32.S | 22 ++++++---------------- 3 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 arch/arm64/lib/crc32-glue.c diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 9675335a7693..bf6c4c0d9a2f 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -16,7 +16,7 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c new file mode 100644 index 000000000000..0b51761d4b75 --- /dev/null +++ b/arch/arm64/lib/crc32-glue.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + return crc32_le_arm64(crc, p, len); +} + +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return __crc32c_le_base(crc, p, len); + + return crc32c_le_arm64(crc, p, len); +} + +u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + return crc32_be_arm64(crc, p, len); +} diff --git a/arch/arm64/lib/crc32.S 
b/arch/arm64/lib/crc32.S index 8340dccff46f..22139691c7ae 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -6,7 +6,6 @@ */ #include -#include #include .arch armv8-a+crc @@ -136,25 +135,16 @@ CPU_BE( rev16 \reg, \reg ) .endm .align 5 -SYM_FUNC_START(crc32_le) -alternative_if_not ARM64_HAS_CRC32 - b crc32_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_le_arm64) __crc32 -SYM_FUNC_END(crc32_le) +SYM_FUNC_END(crc32_le_arm64) .align 5 -SYM_FUNC_START(__crc32c_le) -alternative_if_not ARM64_HAS_CRC32 - b __crc32c_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32c_le_arm64) __crc32 c -SYM_FUNC_END(__crc32c_le) +SYM_FUNC_END(crc32c_le_arm64) .align 5 -SYM_FUNC_START(crc32_be) -alternative_if_not ARM64_HAS_CRC32 - b crc32_be_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_be_arm64) __crc32 be=1 -SYM_FUNC_END(crc32_be) +SYM_FUNC_END(crc32_be_arm64) -- Gitee From 751c182787f3d71c7a3bfc5ead2dcd736bb59df5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:50 +0200 Subject: [PATCH 41/59] arm64/crc32: Reorganize bit/byte ordering macros commit b98b23e19492f4009070761c53b755f623f60e49 upstream In preparation for a new user, reorganize the bit/byte ordering macros that are used to parameterize the crc32 template code and instantiate CRC-32, CRC-32c and 'big endian' CRC-32. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-7-ardb+git@google.com Signed-off-by: Catalin Marinas Signed-off-by: Jun He --- arch/arm64/lib/crc32.S | 91 ++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 52 deletions(-) diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 22139691c7ae..f9920492f135 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -10,44 +10,48 @@ .arch armv8-a+crc - .macro byteorder, reg, be - .if \be -CPU_LE( rev \reg, \reg ) - .else -CPU_BE( rev \reg, \reg ) - .endif + .macro bitle, reg .endm - .macro byteorder16, reg, be - .if \be -CPU_LE( rev16 \reg, \reg ) - .else -CPU_BE( rev16 \reg, \reg ) - .endif + .macro bitbe, reg + rbit \reg, \reg .endm - .macro bitorder, reg, be - .if \be - rbit \reg, \reg - .endif + .macro bytele, reg .endm - .macro bitorder16, reg, be - .if \be + .macro bytebe, reg rbit \reg, \reg - lsr \reg, \reg, #16 - .endif + lsr \reg, \reg, #24 + .endm + + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) .endm - .macro bitorder8, reg, be - .if \be + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) rbit \reg, \reg - lsr \reg, \reg, #24 - .endif +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr .endm - .macro __crc32, c, be=0 - bitorder w0, \be + .macro __crc32, c, order=le + bit\order w0 cmp x2, #16 b.lt 8f // less than 16 bytes @@ -60,14 +64,7 @@ CPU_BE( rev16 \reg, \reg ) add x8, x8, x1 add x1, x1, x7 ldp x5, x6, [x8] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 tst x7, #8 crc32\c\()x w8, w0, x3 @@ -95,42 +92,32 @@ CPU_BE( rev16 \reg, \reg ) 32: ldp x3, x4, [x1], #32 sub x2, x2, #32 ldp x5, x6, [x1, 
#-16] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 crc32\c\()x w0, w0, x3 crc32\c\()x w0, w0, x4 crc32\c\()x w0, w0, x5 crc32\c\()x w0, w0, x6 cbnz x2, 32b -0: bitorder w0, \be +0: bit\order w0 ret 8: tbz x2, #3, 4f ldr x3, [x1], #8 - byteorder x3, \be - bitorder x3, \be + \order x3 crc32\c\()x w0, w0, x3 4: tbz x2, #2, 2f ldr w3, [x1], #4 - byteorder w3, \be - bitorder w3, \be + \order w3 crc32\c\()w w0, w0, w3 2: tbz x2, #1, 1f ldrh w3, [x1], #2 - byteorder16 w3, \be - bitorder16 w3, \be + hword\order w3 crc32\c\()h w0, w0, w3 1: tbz x2, #0, 0f ldrb w3, [x1] - bitorder8 w3, \be + byte\order w3 crc32\c\()b w0, w0, w3 -0: bitorder w0, \be +0: bit\order w0 ret .endm @@ -146,5 +133,5 @@ SYM_FUNC_END(crc32c_le_arm64) .align 5 SYM_FUNC_START(crc32_be_arm64) - __crc32 be=1 + __crc32 order=be SYM_FUNC_END(crc32_be_arm64) -- Gitee From 95a64dd1c06bb5ece8a26b3124a757a61bff7884 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:51 +0200 Subject: [PATCH 42/59] arm64/crc32: Implement 4-way interleave using PMULL commit a6478d69cf56d5deb4c28a6486376d9c7895abec upstream Now that kernel mode NEON no longer disables preemption, using FP/SIMD in library code which is not obviously part of the crypto subsystem is no longer problematic, as it will no longer incur unexpected latencies. So accelerate the CRC-32 library code on arm64 to use a 4-way interleave, using PMULL instructions to implement the folding. On Apple M2, this results in a speedup of 2 - 2.8x when using input sizes of 1k - 8k. For smaller sizes, the overhead of preserving and restoring the FP/SIMD register file may not be worth it, so 1k is used as a threshold for choosing this code path. The coefficient tables were generated using code provided by Eric. 
[0] [0] https://github.com/ebiggers/libdeflate/blob/master/scripts/gen_crc32_multipliers.c Cc: Eric Biggers Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-8-ardb+git@google.com Signed-off-by: Catalin Marinas Signed-off-by: Jun He --- arch/arm64/lib/crc32-glue.c | 48 ++++++++ arch/arm64/lib/crc32.S | 231 +++++++++++++++++++++++++++++++++++- 2 files changed, 276 insertions(+), 3 deletions(-) diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c index 0b51761d4b75..295ae3e6b997 100644 --- a/arch/arm64/lib/crc32-glue.c +++ b/arch/arm64/lib/crc32-glue.c @@ -4,16 +4,40 @@ #include #include +#include +#include +#include + +#include + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_le_arm64(crc, p, len); } @@ -22,6 +46,18 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return __crc32c_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + 
kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32c_le_arm64(crc, p, len); } @@ -30,5 +66,17 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_be_arm64(crc, p, len); } diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index f9920492f135..68825317460f 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -1,14 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Accelerated CRC32(C) using AArch64 CRC instructions + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions * - * Copyright (C) 2016 - 2018 Linaro Ltd + * Copyright (C) 2016 - 2018 Linaro Ltd. 
+ * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel */ #include #include - .arch armv8-a+crc + .cpu generic+crc+crypto .macro bitle, reg .endm @@ -135,3 +138,225 @@ SYM_FUNC_END(crc32c_le_arm64) SYM_FUNC_START(crc32_be_arm64) __crc32 order=be SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + 
.endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 
0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 
0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 -- Gitee From 1c4ab93076d44471d8890f0c4333ada3985d5294 Mon Sep 17 00:00:00 2001 From: Xie 
Xiaodong Date: Tue, 20 Jan 2026 15:45:49 +0800 Subject: [PATCH 43/59] KVM: Add missing macro guards for virt_dev irqbypass Missing some CONFIG_* macro guards for feature dependencies. fix(b789f735 KVM: arm64: Introduce shadow device) Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/arm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b58a7c46d993..b1089e291bbb 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -457,6 +457,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_ARM64_HDBSS case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: r = system_supports_hdbss(); + break; +#endif #ifdef CONFIG_VIRT_PLAT_DEV case KVM_CAP_ARM_VIRT_MSI_BYPASS: r = sdev_enable; -- Gitee From b94f1f42cf59fbc9961fe9fda821ab85e4e4711b Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 6 Feb 2024 21:34:53 +0800 Subject: [PATCH 44/59] KVM: arm64: Probe Hisi CPU TYPE from ACPI/DTB commit dab235e28e1c02e4309d8099790d965f9b943323 openEuler Parse ACPI/DTB to get where the hypervisor is running. 
Signed-off-by: Zenghui Yu Signed-off-by: Yanan Wang Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/hisi_cpu_model.h | 19 ++++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/Makefile | 1 + arch/arm64/kvm/arm.c | 6 ++ arch/arm64/kvm/hisi_cpu_model.c | 83 +++++++++++++++++++++++++ 5 files changed, 110 insertions(+) create mode 100644 arch/arm64/include/asm/hisi_cpu_model.h create mode 100644 arch/arm64/kvm/hisi_cpu_model.c diff --git a/arch/arm64/include/asm/hisi_cpu_model.h b/arch/arm64/include/asm/hisi_cpu_model.h new file mode 100644 index 000000000000..003a3a53cf33 --- /dev/null +++ b/arch/arm64/include/asm/hisi_cpu_model.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#ifndef __HISI_CPU_MODEL_H__ +#define __HISI_CPU_MODEL_H__ + +enum hisi_cpu_type { + HI_1612, + HI_1616, + HI_1620, + UNKNOWN_HI_TYPE +}; + +extern enum hisi_cpu_type hi_cpu_type; + +void probe_hisi_cpu_type(void); +#endif /* __HISI_CPU_MODEL_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ce9908fe0acf..3c5917d0a656 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 9221291d99a6..203288bbae11 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -15,6 +15,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + hisi_cpu_model.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b1089e291bbb..3e76b02cf80b 100644 --- a/arch/arm64/kvm/arm.c +++ 
b/arch/arm64/kvm/arm.c @@ -60,6 +60,9 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; +/* Hisi cpu type enum */ +enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; + static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); bool is_kvm_arm_initialised(void) @@ -2756,6 +2759,9 @@ static __init int kvm_arm_init(void) return err; } + /* Probe the Hisi CPU type */ + probe_hisi_cpu_type(); + in_hyp_mode = is_kernel_in_hyp_mode(); if (in_hyp_mode) diff --git a/arch/arm64/kvm/hisi_cpu_model.c b/arch/arm64/kvm/hisi_cpu_model.c new file mode 100644 index 000000000000..4d5a099bc27a --- /dev/null +++ b/arch/arm64/kvm/hisi_cpu_model.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#include +#include +#include +#include + +#ifdef CONFIG_ACPI + +/* ACPI Hisi oem table id str */ +const char *oem_str[] = { + "HIP06", /* Hisi 1612 */ + "HIP07", /* Hisi 1616 */ + "HIP08" /* Hisi 1620 */ +}; + +/* + * Get Hisi oem table id. 
+ */ +static void acpi_get_hw_cpu_type(void) +{ + struct acpi_table_header *table; + acpi_status status; + int i, str_size = ARRAY_SIZE(oem_str); + + /* Get oem table id from ACPI table header */ + status = acpi_get_table(ACPI_SIG_DSDT, 0, &table); + if (ACPI_FAILURE(status)) { + pr_err("Failed to get ACPI table: %s\n", + acpi_format_exception(status)); + return; + } + + for (i = 0; i < str_size; ++i) { + if (!strncmp(oem_str[i], table->oem_table_id, 5)) { + hi_cpu_type = i; + return; + } + } +} + +#else +static void acpi_get_hw_cpu_type(void) {} +#endif + +/* of Hisi cpu model str */ +const char *of_model_str[] = { + "Hi1612", + "Hi1616" +}; + +static void of_get_hw_cpu_type(void) +{ + const char *cpu_type; + int ret, i, str_size = ARRAY_SIZE(of_model_str); + + ret = of_property_read_string(of_root, "model", &cpu_type); + if (ret < 0) { + pr_err("Failed to get Hisi cpu model by OF.\n"); + return; + } + + for (i = 0; i < str_size; ++i) { + if (strstr(cpu_type, of_model_str[i])) { + hi_cpu_type = i; + return; + } + } +} + +void probe_hisi_cpu_type(void) +{ + if (!acpi_disabled) + acpi_get_hw_cpu_type(); + else + of_get_hw_cpu_type(); + + if (hi_cpu_type == UNKNOWN_HI_TYPE) + pr_warn("UNKNOWN Hisi cpu type.\n"); +} -- Gitee From 7e2664e50a1a841436bb1025a3794c52e362d052 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 6 Feb 2024 21:34:54 +0800 Subject: [PATCH 45/59] KVM: arm64: Add support for probing Hisi ncsnp capability commit 625086f87673687dc420e79c9fedff0c2d65cb49 openEuler Kunpeng 920 offers the HHA ncsnp capability, with which hypervisor doesn't need to perform a lot of cache maintenance like before (in case the guest has some non-cacheable Stage-1 mappings). 
Currently we apply this hardware capability when - vCPU switching MMU+caches on/off - creating Stage-2 mappings for Daborts Signed-off-by: Zenghui Yu Signed-off-by: Yanan Wang Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/hisi_cpu_model.h | 2 ++ arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kvm/arm.c | 2 ++ arch/arm64/kvm/hisi_cpu_model.c | 34 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 2 +- 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/hisi_cpu_model.h b/arch/arm64/include/asm/hisi_cpu_model.h index 003a3a53cf33..67008d17416e 100644 --- a/arch/arm64/include/asm/hisi_cpu_model.h +++ b/arch/arm64/include/asm/hisi_cpu_model.h @@ -14,6 +14,8 @@ enum hisi_cpu_type { }; extern enum hisi_cpu_type hi_cpu_type; +extern bool kvm_ncsnp_support; void probe_hisi_cpu_type(void); +void probe_hisi_ncsnp_support(void); #endif /* __HISI_CPU_MODEL_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 5918769294ed..4f7e9711a4af 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -218,7 +218,7 @@ static inline void __clean_dcache_guest_page(void *va, size_t size) * faulting in pages. Furthermore, FWB implies IDC, so cleaning to * PoU is not required either in this case. 
*/ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (kvm_ncsnp_support || cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) return; kvm_flush_dcache_to_poc(va, size); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3e76b02cf80b..51d09197b9ff 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -62,6 +62,7 @@ static bool vgic_present, kvm_arm_initialised; /* Hisi cpu type enum */ enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; +bool kvm_ncsnp_support; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); @@ -2761,6 +2762,7 @@ static __init int kvm_arm_init(void) /* Probe the Hisi CPU type */ probe_hisi_cpu_type(); + probe_hisi_ncsnp_support(); in_hyp_mode = is_kernel_in_hyp_mode(); diff --git a/arch/arm64/kvm/hisi_cpu_model.c b/arch/arm64/kvm/hisi_cpu_model.c index 4d5a099bc27a..52eecf1ba1cf 100644 --- a/arch/arm64/kvm/hisi_cpu_model.c +++ b/arch/arm64/kvm/hisi_cpu_model.c @@ -81,3 +81,37 @@ void probe_hisi_cpu_type(void) if (hi_cpu_type == UNKNOWN_HI_TYPE) pr_warn("UNKNOWN Hisi cpu type.\n"); } + +#define NCSNP_MMIO_BASE 0x20107E238 + +/* + * We have the fantastic HHA ncsnp capability on Kunpeng 920, + * with which hypervisor doesn't need to perform a lot of cache + * maintenance like before (in case the guest has non-cacheable + * Stage-1 mappings). + */ +void probe_hisi_ncsnp_support(void) +{ + void __iomem *base; + unsigned int high; + + kvm_ncsnp_support = false; + + if (hi_cpu_type != HI_1620) + goto out; + + base = ioremap(NCSNP_MMIO_BASE, 4); + if (!base) { + pr_err("Unable to map MMIO region when probing ncsnp!\n"); + goto out; + } + + high = readl_relaxed(base) >> 28; + iounmap(base); + if (high != 0x1) + kvm_ncsnp_support = true; + +out: + kvm_info("Hisi ncsnp: %s\n", kvm_ncsnp_support ? 
"enabled" : + "disabled"); +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c624e6dd54df..0ebe8ccae5a8 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1361,7 +1361,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) .arg = pgt, }; - if (stage2_has_fwb(pgt)) + if (kvm_ncsnp_support || stage2_has_fwb(pgt)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); -- Gitee From 17bdbc06ad82d3c2c9ab320697f6efafba75da69 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 6 Feb 2024 21:34:55 +0800 Subject: [PATCH 46/59] KVM: arm64: Only probe Hisi ncsnp feature on Hisi CPUs commit 029e24ef7b172df5c4c693966fd50cb85b54ebfb openEuler The "ncsnp" is an implementation specific CPU virtualization feature on Hisi 1620 series CPUs. This feature works just like ARM standard S2FWB to reduce some cache management operations in virtualization. Given that it's Hisi specific feature, let's restrict the detection only to Hisi CPUs. To realize this: 1) Add a sub-directory `hisilicon/` within arch/arm64/kvm to hold code for Hisi specific virtualization features. 2) Add a new kconfig option `CONFIG_KVM_HISI_VIRT` for users to select the whole Hisi specific virtualization features. 3) Add a generic global KVM variable `kvm_ncsnp_support` which is `false` by default. Only re-initialize it when we have `CONFIG_KVM_HISI_VIRT` enabled. 
Signed-off-by: Yanan Wang Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> resolve conflict of config --- arch/arm64/include/asm/kvm_host.h | 3 +- arch/arm64/kernel/image-vars.h | 5 + arch/arm64/kvm/Kconfig | 1 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 13 ++- arch/arm64/kvm/hisilicon/Kconfig | 7 ++ arch/arm64/kvm/hisilicon/Makefile | 2 + .../hisi_virt.c} | 93 ++++++++++--------- .../hisilicon/hisi_virt.h} | 14 ++- 9 files changed, 83 insertions(+), 57 deletions(-) create mode 100644 arch/arm64/kvm/hisilicon/Kconfig create mode 100644 arch/arm64/kvm/hisilicon/Makefile rename arch/arm64/kvm/{hisi_cpu_model.c => hisilicon/hisi_virt.c} (44%) rename arch/arm64/{include/asm/hisi_cpu_model.h => kvm/hisilicon/hisi_virt.h} (39%) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3c5917d0a656..c7c52239da7d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #include #include #include -#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -1243,4 +1242,6 @@ extern unsigned int twedel; void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); +extern bool kvm_ncsnp_support; + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index d10d3fed31d9..82957f93fa21 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -111,6 +111,11 @@ KVM_NVHE_ALIAS(__hyp_rodata_end); /* pKVM static key */ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); +#ifdef CONFIG_KVM_HISI_VIRT +/* Capability of non-cacheable snooping */ +KVM_NVHE_ALIAS(kvm_ncsnp_support); +#endif + #endif /* CONFIG_KVM */ #ifdef CONFIG_EFI_ZBOOT diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 1a777715199f..99cda39ac868 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -5,6 +5,7 @@ source "virt/lib/Kconfig" source "virt/kvm/Kconfig" 
+source "arch/arm64/kvm/hisilicon/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 203288bbae11..d2ccd538e2c2 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -15,7 +15,6 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ - hisi_cpu_model.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -26,6 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ kvm-$(CONFIG_VIRT_PLAT_DEV) += vgic/shadow_dev.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o +obj-$(CONFIG_KVM_HISI_VIRT) += hisilicon/ always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 51d09197b9ff..83b5e86f3188 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,6 +51,10 @@ static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_rme_is_available); +#ifdef CONFIG_KVM_HISI_VIRT +#include "hisilicon/hisi_virt.h" +#endif + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -60,8 +64,7 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; -/* Hisi cpu type enum */ -enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; +/* Capability of non-cacheable snooping */ bool kvm_ncsnp_support; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); @@ -2760,9 +2763,11 @@ static __init int kvm_arm_init(void) return err; } - /* Probe the Hisi CPU type */ +#ifdef CONFIG_KVM_HISI_VIRT probe_hisi_cpu_type(); - probe_hisi_ncsnp_support(); + kvm_ncsnp_support = hisi_ncsnp_supported(); +#endif + kvm_info("KVM ncsnp %s\n", kvm_ncsnp_support ? 
"enabled" : "disabled"); in_hyp_mode = is_kernel_in_hyp_mode(); diff --git a/arch/arm64/kvm/hisilicon/Kconfig b/arch/arm64/kvm/hisilicon/Kconfig new file mode 100644 index 000000000000..6536f897a32e --- /dev/null +++ b/arch/arm64/kvm/hisilicon/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +config KVM_HISI_VIRT + bool "HiSilicon SoC specific virtualization features" + depends on ARCH_HISI + help + Support for HiSilicon SoC specific virtualization features. + On non-HiSilicon platforms, say N here. diff --git a/arch/arm64/kvm/hisilicon/Makefile b/arch/arm64/kvm/hisilicon/Makefile new file mode 100644 index 000000000000..849f99d1526d --- /dev/null +++ b/arch/arm64/kvm/hisilicon/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_KVM_HISI_VIRT) += hisi_virt.o diff --git a/arch/arm64/kvm/hisi_cpu_model.c b/arch/arm64/kvm/hisilicon/hisi_virt.c similarity index 44% rename from arch/arm64/kvm/hisi_cpu_model.c rename to arch/arm64/kvm/hisilicon/hisi_virt.c index 52eecf1ba1cf..9587f9508a79 100644 --- a/arch/arm64/kvm/hisi_cpu_model.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -1,26 +1,34 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Copyright(c) 2022 Huawei Technologies Co., Ltd */ #include #include #include #include +#include "hisi_virt.h" -#ifdef CONFIG_ACPI +static enum hisi_cpu_type cpu_type = UNKNOWN_HI_TYPE; + +static const char * const hisi_cpu_type_str[] = { + "Hisi1612", + "Hisi1616", + "Hisi1620", + "Unknown" +}; /* ACPI Hisi oem table id str */ -const char *oem_str[] = { +static const char * const oem_str[] = { "HIP06", /* Hisi 1612 */ "HIP07", /* Hisi 1616 */ "HIP08" /* Hisi 1620 */ }; /* - * Get Hisi oem table id. + * Probe Hisi CPU type form ACPI. 
*/ -static void acpi_get_hw_cpu_type(void) +static enum hisi_cpu_type acpi_get_hisi_cpu_type(void) { struct acpi_table_header *table; acpi_status status; @@ -29,89 +37,88 @@ static void acpi_get_hw_cpu_type(void) /* Get oem table id from ACPI table header */ status = acpi_get_table(ACPI_SIG_DSDT, 0, &table); if (ACPI_FAILURE(status)) { - pr_err("Failed to get ACPI table: %s\n", - acpi_format_exception(status)); - return; + pr_warn("Failed to get ACPI table: %s\n", + acpi_format_exception(status)); + return UNKNOWN_HI_TYPE; } for (i = 0; i < str_size; ++i) { - if (!strncmp(oem_str[i], table->oem_table_id, 5)) { - hi_cpu_type = i; - return; - } + if (!strncmp(oem_str[i], table->oem_table_id, 5)) + return i; } -} -#else -static void acpi_get_hw_cpu_type(void) {} -#endif + return UNKNOWN_HI_TYPE; +} /* of Hisi cpu model str */ -const char *of_model_str[] = { +static const char * const of_model_str[] = { "Hi1612", "Hi1616" }; -static void of_get_hw_cpu_type(void) +/* + * Probe Hisi CPU type from DT. + */ +static enum hisi_cpu_type of_get_hisi_cpu_type(void) { - const char *cpu_type; + const char *model; int ret, i, str_size = ARRAY_SIZE(of_model_str); - ret = of_property_read_string(of_root, "model", &cpu_type); + /* + * Note: There may not be a "model" node in FDT, which + * is provided by the vendor. In this case, we are not + * able to get CPU type information through this way. 
+ */ + ret = of_property_read_string(of_root, "model", &model); if (ret < 0) { - pr_err("Failed to get Hisi cpu model by OF.\n"); - return; + pr_warn("Failed to get Hisi cpu model by OF.\n"); + return UNKNOWN_HI_TYPE; } for (i = 0; i < str_size; ++i) { - if (strstr(cpu_type, of_model_str[i])) { - hi_cpu_type = i; - return; - } + if (strstr(model, of_model_str[i])) + return i; } + + return UNKNOWN_HI_TYPE; } void probe_hisi_cpu_type(void) { if (!acpi_disabled) - acpi_get_hw_cpu_type(); + cpu_type = acpi_get_hisi_cpu_type(); else - of_get_hw_cpu_type(); + cpu_type = of_get_hisi_cpu_type(); - if (hi_cpu_type == UNKNOWN_HI_TYPE) - pr_warn("UNKNOWN Hisi cpu type.\n"); + kvm_info("detected: Hisi CPU type '%s'\n", hisi_cpu_type_str[cpu_type]); } -#define NCSNP_MMIO_BASE 0x20107E238 - /* * We have the fantastic HHA ncsnp capability on Kunpeng 920, * with which hypervisor doesn't need to perform a lot of cache * maintenance like before (in case the guest has non-cacheable * Stage-1 mappings). */ -void probe_hisi_ncsnp_support(void) +#define NCSNP_MMIO_BASE 0x20107E238 +bool hisi_ncsnp_supported(void) { void __iomem *base; unsigned int high; + bool supported = false; - kvm_ncsnp_support = false; - - if (hi_cpu_type != HI_1620) - goto out; + if (cpu_type != HI_1620) + return supported; base = ioremap(NCSNP_MMIO_BASE, 4); if (!base) { - pr_err("Unable to map MMIO region when probing ncsnp!\n"); - goto out; + pr_warn("Unable to map MMIO region when probing ncsnp!\n"); + return supported; } high = readl_relaxed(base) >> 28; iounmap(base); if (high != 0x1) - kvm_ncsnp_support = true; + supported = true; -out: - kvm_info("Hisi ncsnp: %s\n", kvm_ncsnp_support ? 
"enabled" : - "disabled"); + return supported; } diff --git a/arch/arm64/include/asm/hisi_cpu_model.h b/arch/arm64/kvm/hisilicon/hisi_virt.h similarity index 39% rename from arch/arm64/include/asm/hisi_cpu_model.h rename to arch/arm64/kvm/hisilicon/hisi_virt.h index 67008d17416e..c4b5acc93fec 100644 --- a/arch/arm64/include/asm/hisi_cpu_model.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -1,10 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Copyright(c) 2019 Huawei Technologies Co., Ltd + * Copyright(c) 2022 Huawei Technologies Co., Ltd */ -#ifndef __HISI_CPU_MODEL_H__ -#define __HISI_CPU_MODEL_H__ +#ifndef __HISI_VIRT_H__ +#define __HISI_VIRT_H__ enum hisi_cpu_type { HI_1612, @@ -13,9 +13,7 @@ enum hisi_cpu_type { UNKNOWN_HI_TYPE }; -extern enum hisi_cpu_type hi_cpu_type; -extern bool kvm_ncsnp_support; - void probe_hisi_cpu_type(void); -void probe_hisi_ncsnp_support(void); -#endif /* __HISI_CPU_MODEL_H__ */ +bool hisi_ncsnp_supported(void); + +#endif /* __HISI_VIRT_H__ */ -- Gitee From 14e0a3ae4bdc0a059ca7d49d2645aff08f9c9432 Mon Sep 17 00:00:00 2001 From: lishusen Date: Tue, 6 Feb 2024 21:34:56 +0800 Subject: [PATCH 47/59] KVM: arm64: Support a new HiSi CPU type commit 8dea2b6c0541784a3f3356f3db1db1ecc3f5324b openEuler Add a new entry ("HIP09") in oem_str[] to support detection of the new HiSi CPU type. 
Signed-off-by: Quan Zhou Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 4 +++- arch/arm64/kvm/hisilicon/hisi_virt.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 9587f9508a79..90c363ed642e 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -15,6 +15,7 @@ static const char * const hisi_cpu_type_str[] = { "Hisi1612", "Hisi1616", "Hisi1620", + "HIP09", "Unknown" }; @@ -22,7 +23,8 @@ static const char * const hisi_cpu_type_str[] = { static const char * const oem_str[] = { "HIP06", /* Hisi 1612 */ "HIP07", /* Hisi 1616 */ - "HIP08" /* Hisi 1620 */ + "HIP08", /* Hisi 1620 */ + "HIP09" /* HIP09 */ }; /* diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index c4b5acc93fec..9231b1dca7f2 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -10,6 +10,7 @@ enum hisi_cpu_type { HI_1612, HI_1616, HI_1620, + HI_IP09, UNKNOWN_HI_TYPE }; -- Gitee From 4af3264f3e51e277bdada57b3737cbb8f58de783 Mon Sep 17 00:00:00 2001 From: lishusen Date: Tue, 6 Feb 2024 21:34:57 +0800 Subject: [PATCH 48/59] KVM: arm64: Probe and configure DVMBM capability on HiSi CPUs commit e85b97c7e2b40e2457665368b4b9779556efd4c5 openEuler DVMBM is an virtualization extension since HIP09, which allows TLBI executed at NS EL1 to be broadcast in a configurable range of physical CPUs (even with HCR_EL2.FB set). It will bring TLBI broadcast optimization. Introduce the method to detect and enable this feature. Also add a kernel command parameter "kvm-arm.dvmbm_enabled" (=0 on default) so that users can {en,dis}able DVMBM on need. The parameter description is added under Documentation/. 
Signed-off-by: Quan Zhou Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- .../admin-guide/kernel-parameters.txt | 4 ++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 6 +++ arch/arm64/kvm/hisilicon/hisi_virt.c | 49 +++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 6 +++ 5 files changed, 66 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35be0d82daa3..08dc6ddb6f8d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2695,6 +2695,10 @@ [KVM,ARM] Allow use of GICv4 for direct injection of LPIs. + kvm-arm.dvmbm_enabled= + [KVM,ARM] Allow use of HiSilicon DVMBM capability. + Default: 0 + kvm_cma_resv_ratio=n [PPC] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c7c52239da7d..392c8b5b95f2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1243,5 +1243,6 @@ void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); extern bool kvm_ncsnp_support; +extern bool kvm_dvmbm_support; #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 83b5e86f3188..b339b3dd79e8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -67,6 +67,10 @@ static bool vgic_present, kvm_arm_initialised; /* Capability of non-cacheable snooping */ bool kvm_ncsnp_support; +/* Capability of DVMBM */ +bool kvm_dvmbm_support; + + static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); bool is_kvm_arm_initialised(void) @@ -2766,8 +2770,10 @@ static __init int kvm_arm_init(void) #ifdef CONFIG_KVM_HISI_VIRT probe_hisi_cpu_type(); kvm_ncsnp_support = hisi_ncsnp_supported(); + kvm_dvmbm_support = hisi_dvmbm_supported(); #endif 
kvm_info("KVM ncsnp %s\n", kvm_ncsnp_support ? "enabled" : "disabled"); + kvm_info("KVM dvmbm %s\n", kvm_dvmbm_support ? "enabled" : "disabled"); in_hyp_mode = is_kernel_in_hyp_mode(); diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 90c363ed642e..b81488cd663b 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -11,6 +11,8 @@ static enum hisi_cpu_type cpu_type = UNKNOWN_HI_TYPE; +static bool dvmbm_enabled; + static const char * const hisi_cpu_type_str[] = { "Hisi1612", "Hisi1616", @@ -124,3 +126,50 @@ bool hisi_ncsnp_supported(void) return supported; } + +static int __init early_dvmbm_enable(char *buf) +{ + return strtobool(buf, &dvmbm_enabled); +} +early_param("kvm-arm.dvmbm_enabled", early_dvmbm_enable); + +static void hardware_enable_dvmbm(void *data) +{ + u64 val; + + val = read_sysreg_s(SYS_LSUDVM_CTRL_EL2); + val |= LSUDVM_CTLR_EL2_MASK; + write_sysreg_s(val, SYS_LSUDVM_CTRL_EL2); +} + +static void hardware_disable_dvmbm(void *data) +{ + u64 val; + + val = read_sysreg_s(SYS_LSUDVM_CTRL_EL2); + val &= ~LSUDVM_CTLR_EL2_MASK; + write_sysreg_s(val, SYS_LSUDVM_CTRL_EL2); +} + +bool hisi_dvmbm_supported(void) +{ + if (cpu_type != HI_IP09) + return false; + + /* Determine whether DVMBM is supported by the hardware */ + if (!(read_sysreg(aidr_el1) & AIDR_EL1_DVMBM_MASK)) + return false; + + /* User provided kernel command-line parameter */ + if (!dvmbm_enabled || !is_kernel_in_hyp_mode()) { + on_each_cpu(hardware_disable_dvmbm, NULL, 1); + return false; + } + + /* + * Enable TLBI Broadcast optimization by setting + * LSUDVM_CTRL_EL2's bit[0]. 
+ */ + on_each_cpu(hardware_enable_dvmbm, NULL, 1); + return true; +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 9231b1dca7f2..f505d44e386f 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -14,7 +14,13 @@ enum hisi_cpu_type { UNKNOWN_HI_TYPE }; +/* HIP09 */ +#define AIDR_EL1_DVMBM_MASK GENMASK_ULL(13, 12) +#define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) +#define LSUDVM_CTLR_EL2_MASK BIT_ULL(0) + void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); +bool hisi_dvmbm_supported(void); #endif /* __HISI_VIRT_H__ */ -- Gitee From cc006aa3714fd0678139861777584774f1fcd897 Mon Sep 17 00:00:00 2001 From: lishusen Date: Tue, 6 Feb 2024 21:34:58 +0800 Subject: [PATCH 49/59] KVM: arm64: Add kvm_vcpu_arch::sched_cpus and pre_sched_cpus commit 1c342c1a8d0a6ab1fd8a80f785d8f4b8e404d649 openEuler We already have cpus_ptr in current thread struct now, through which we can know the pcpu range the thread is allowed to run on. So in kvm_arch_vcpu_{load,put}, we can also know the pcpu range the vcpu thread is allowed to be scheduled on, and that is the range we want to configure for TLBI broadcast. Introduce two variables sched_cpus and pre_sched_cpus in struct kvm_vcpu_arch. @sched_cpus always comes from current->cpus_ptr and @pre_sched_cpus always comes from @sched_cpus. 
Signed-off-by: Quan Zhou Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_host.h | 5 ++++ arch/arm64/kvm/arm.c | 14 ++++++++--- arch/arm64/kvm/hisilicon/hisi_virt.c | 37 ++++++++++++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 25 +++++++++++++++++++ 4 files changed, 77 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 392c8b5b95f2..ef247b3e5bc9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -649,6 +649,11 @@ struct kvm_vcpu_arch { u64 prod_el2; } hdbss) #endif +#ifdef CONFIG_KVM_HISI_VIRT + /* pCPUs this vCPU can be scheduled on. Pure copy of current->cpus_ptr */ + cpumask_var_t sched_cpus; + cpumask_var_t pre_sched_cpus; +#endif }; /* diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b339b3dd79e8..0614cba20bc1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,9 +51,7 @@ static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_rme_is_available); -#ifdef CONFIG_KVM_HISI_VIRT #include "hisilicon/hisi_virt.h" -#endif DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); @@ -546,6 +544,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err; + err = kvm_sched_affinity_vcpu_init(vcpu); + if (err) + return err; + err = kvm_share_hyp(vcpu, vcpu + 1); if (err) kvm_vgic_vcpu_destroy(vcpu); @@ -564,6 +566,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu); kvm_arm_vcpu_destroy(vcpu); + + kvm_sched_affinity_vcpu_destroy(vcpu); } void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) @@ -625,6 +629,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); + + kvm_tlbi_dvmbm_vcpu_load(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -646,6 +652,8 @@ void 
kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); + + kvm_tlbi_dvmbm_vcpu_put(vcpu); } static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) @@ -2767,11 +2775,9 @@ static __init int kvm_arm_init(void) return err; } -#ifdef CONFIG_KVM_HISI_VIRT probe_hisi_cpu_type(); kvm_ncsnp_support = hisi_ncsnp_supported(); kvm_dvmbm_support = hisi_dvmbm_supported(); -#endif kvm_info("KVM ncsnp %s\n", kvm_ncsnp_support ? "enabled" : "disabled"); kvm_info("KVM dvmbm %s\n", kvm_dvmbm_support ? "enabled" : "disabled"); diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index b81488cd663b..ac12fc54a6b4 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -173,3 +173,40 @@ bool hisi_dvmbm_supported(void) on_each_cpu(hardware_enable_dvmbm, NULL, 1); return true; } + +int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return 0; + + if (!zalloc_cpumask_var(&vcpu->arch.sched_cpus, GFP_ATOMIC) || + !zalloc_cpumask_var(&vcpu->arch.pre_sched_cpus, GFP_ATOMIC)) + return -ENOMEM; + + return 0; +} + +void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + free_cpumask_var(vcpu->arch.sched_cpus); + free_cpumask_var(vcpu->arch.pre_sched_cpus); +} + +void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + cpumask_copy(vcpu->arch.sched_cpus, current->cpus_ptr); +} + +void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_dvmbm_support) + return; + + cpumask_copy(vcpu->arch.pre_sched_cpus, vcpu->arch.sched_cpus); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index f505d44e386f..8d8ef6aa165a 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -6,6 +6,7 @@ #ifndef __HISI_VIRT_H__ #define __HISI_VIRT_H__ +#ifdef CONFIG_KVM_HISI_VIRT enum 
hisi_cpu_type { HI_1612, HI_1616, @@ -23,4 +24,28 @@ void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void); +int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu); +void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu); +void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu); +#else +static inline void probe_hisi_cpu_type(void) {} +static inline bool hisi_ncsnp_supported(void) +{ + return false; +} +static inline bool hisi_dvmbm_supported(void) +{ + return false; +} + +static inline int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu) {} +static inline void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) {} +static inline void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) {} +#endif /* CONFIG_KVM_HISI_VIRT */ + #endif /* __HISI_VIRT_H__ */ -- Gitee From 08e7402041e347fc3341789700b3a0b6feb8f32c Mon Sep 17 00:00:00 2001 From: lishusen Date: Tue, 6 Feb 2024 21:34:59 +0800 Subject: [PATCH 50/59] KVM: arm64: Add kvm_arch::sched_cpus and sched_lock commit c4ed39bbe2061991f63d29930a47e5cf7d11b2c7 openEuler Introduce sched_cpus and sched_lock in struct kvm_arch. sched_cpus will store the union of all vcpus' cpus_ptr in a VM and will be used for the TLBI broadcast range for this VM. sched_lock ensures a exclusive manipulation of sched_cpus. In vcpu_load, we should decide whether to perform the subsequent update operation by checking whether sched_cpus has changed. 
Signed-off-by: Quan Zhou Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_host.h | 4 +++ arch/arm64/kvm/arm.c | 6 ++++ arch/arm64/kvm/hisilicon/hisi_virt.c | 52 ++++++++++++++++++++++++++++ arch/arm64/kvm/hisilicon/hisi_virt.h | 7 ++++ 4 files changed, 69 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ef247b3e5bc9..cffe1ecd9ddf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -313,6 +313,10 @@ struct kvm_arch { bool is_realm; struct realm realm; +#ifdef CONFIG_KVM_HISI_VIRT + spinlock_t sched_lock; + cpumask_var_t sched_cpus; /* Union of all vcpu's cpus_ptr */ +#endif }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0614cba20bc1..a57405c2f257 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -248,6 +248,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + ret = kvm_sched_affinity_vm_init(kvm); + if (ret) + return ret; + mutex_init(&kvm->arch.config_lock); #ifdef CONFIG_LOCKDEP @@ -330,6 +334,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) */ void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_sched_affinity_vm_destroy(kvm); + bitmap_free(kvm->arch.pmu_filter); free_cpumask_var(kvm->arch.supported_cpus); diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index ac12fc54a6b4..fe0515c20989 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -197,10 +197,42 @@ void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + cpumask_t mask; + unsigned long i; + + /* Don't bother on old hardware */ if (!kvm_dvmbm_support) return; cpumask_copy(vcpu->arch.sched_cpus, current->cpus_ptr); + + if 
(likely(cpumask_equal(vcpu->arch.sched_cpus, + vcpu->arch.pre_sched_cpus))) + return; + + /* Re-calculate sched_cpus for this VM */ + spin_lock(&kvm->arch.sched_lock); + + cpumask_clear(&mask); + kvm_for_each_vcpu(i, tmp, kvm) { + /* + * We may get the stale sched_cpus if another thread + * is concurrently changing its affinity. It'll + * eventually go through vcpu_load() and we rely on + * the last sched_lock holder to make things correct. + */ + cpumask_or(&mask, &mask, tmp->arch.sched_cpus); + } + + if (cpumask_equal(kvm->arch.sched_cpus, &mask)) + goto out_unlock; + + cpumask_copy(kvm->arch.sched_cpus, &mask); + +out_unlock: + spin_unlock(&kvm->arch.sched_lock); } void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) @@ -210,3 +242,23 @@ void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) cpumask_copy(vcpu->arch.pre_sched_cpus, vcpu->arch.sched_cpus); } + +int kvm_sched_affinity_vm_init(struct kvm *kvm) +{ + if (!kvm_dvmbm_support) + return 0; + + spin_lock_init(&kvm->arch.sched_lock); + if (!zalloc_cpumask_var(&kvm->arch.sched_cpus, GFP_ATOMIC)) + return -ENOMEM; + + return 0; +} + +void kvm_sched_affinity_vm_destroy(struct kvm *kvm) +{ + if (!kvm_dvmbm_support) + return; + + free_cpumask_var(kvm->arch.sched_cpus); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 8d8ef6aa165a..3de270ad2da5 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -26,6 +26,8 @@ bool hisi_dvmbm_supported(void); int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu); void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu); +int kvm_sched_affinity_vm_init(struct kvm *kvm); +void kvm_sched_affinity_vm_destroy(struct kvm *kvm); void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu); void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu); #else @@ -44,6 +46,11 @@ static inline int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu) return 0; } static inline void 
kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu) {} +static inline int kvm_sched_affinity_vm_init(struct kvm *kvm) +{ + return 0; +} +static inline void kvm_sched_affinity_vm_destroy(struct kvm *kvm) {} static inline void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) {} static inline void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) {} #endif /* CONFIG_KVM_HISI_VIRT */ -- Gitee From 62af5caacbf2529302620e3ad93cca772110d7de Mon Sep 17 00:00:00 2001 From: lishusen Date: Tue, 6 Feb 2024 21:35:00 +0800 Subject: [PATCH 51/59] KVM: arm64: Implement the capability of DVMBM commit d47f8143fa03bed7a8d74643f070e27c5adc78de openEuler Implement the capability of DVMBM. Before each vcpu is loaded, we re-calculate the VM-wide sched_cpus, and if it's changed we will kick all other vcpus out to reload the latest LSUDVMBM value to the register, and a new request KVM_REQ_RELOAD_TLBI_DVMBM is added to implement this. Otherwise if the sched_cpus is not changed by this single vcpu, in order to ensure the correctness of the contents in the register, we reload the LSUDVMBM value to the register and nothing else will be done. 
Signed-off-by: Quan Zhou Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/arm.c | 3 + arch/arm64/kvm/hisilicon/hisi_virt.c | 114 ++++++++++++++++++++++++++- arch/arm64/kvm/hisilicon/hisi_virt.h | 29 +++++++ 4 files changed, 147 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index cffe1ecd9ddf..d0e05ac1a9db 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -51,6 +51,7 @@ #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) +#define KVM_REQ_RELOAD_TLBI_DVMBM KVM_ARCH_REQ(8) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -316,6 +317,7 @@ struct kvm_arch { #ifdef CONFIG_KVM_HISI_VIRT spinlock_t sched_lock; cpumask_var_t sched_cpus; /* Union of all vcpu's cpus_ptr */ + u64 tlbi_dvmbm; #endif }; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a57405c2f257..b4f1a94779b5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1044,6 +1044,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_dirty_ring_check_request(vcpu)) return 0; + + if (kvm_check_request(KVM_REQ_RELOAD_TLBI_DVMBM, vcpu)) + kvm_hisi_reload_lsudvmbm(vcpu->kvm); } return 1; diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index fe0515c20989..662ddf5b124b 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -195,6 +195,96 @@ void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.pre_sched_cpus); } +static void __kvm_write_lsudvmbm(struct kvm *kvm) +{ + write_sysreg_s(kvm->arch.tlbi_dvmbm, SYS_LSUDVMBM_EL2); +} + +static void kvm_write_lsudvmbm(struct kvm *kvm) +{ + spin_lock(&kvm->arch.sched_lock); + __kvm_write_lsudvmbm(kvm); + 
spin_unlock(&kvm->arch.sched_lock); +} + +static int kvm_dvmbm_get_dies_info(struct kvm *kvm, u64 *vm_aff3s, int size) +{ + int num = 0, cpu; + + for_each_cpu(cpu, kvm->arch.sched_cpus) { + bool found = false; + u64 aff3; + int i; + + if (num >= size) + break; + + aff3 = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 3); + for (i = 0; i < num; i++) { + if (vm_aff3s[i] == aff3) { + found = true; + break; + } + } + + if (!found) + vm_aff3s[num++] = aff3; + } + + return num; +} + +static void kvm_update_vm_lsudvmbm(struct kvm *kvm) +{ + u64 mpidr, aff3, aff2, aff1; + u64 vm_aff3s[DVMBM_MAX_DIES]; + u64 val; + int cpu, nr_dies; + + nr_dies = kvm_dvmbm_get_dies_info(kvm, vm_aff3s, DVMBM_MAX_DIES); + if (nr_dies > 2) { + val = DVMBM_RANGE_ALL_DIES << DVMBM_RANGE_SHIFT; + goto out_update; + } + + if (nr_dies == 1) { + val = DVMBM_RANGE_ONE_DIE << DVMBM_RANGE_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_SHIFT; + + /* fulfill bits [52:0] */ + for_each_cpu(cpu, kvm->arch.sched_cpus) { + mpidr = cpu_logical_map(cpu); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + aff1 = MPIDR_AFFINITY_LEVEL(mpidr, 1); + + val |= 1ULL << (aff2 * 4 + aff1); + } + + goto out_update; + } + + /* nr_dies == 2 */ + val = DVMBM_RANGE_TWO_DIES << DVMBM_RANGE_SHIFT | + DVMBM_GRAN_CLUSTER << DVMBM_GRAN_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_SHIFT | + vm_aff3s[1] << DVMBM_DIE2_SHIFT; + + /* and fulfill bits [43:0] */ + for_each_cpu(cpu, kvm->arch.sched_cpus) { + mpidr = cpu_logical_map(cpu); + aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + + if (aff3 == vm_aff3s[0]) + val |= 1ULL << (aff2 + DVMBM_DIE1_CLUSTER_SHIFT); + else + val |= 1ULL << (aff2 + DVMBM_DIE2_CLUSTER_SHIFT); + } + +out_update: + kvm->arch.tlbi_dvmbm = val; +} + void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -209,8 +299,10 @@ void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) cpumask_copy(vcpu->arch.sched_cpus, current->cpus_ptr); if (likely(cpumask_equal(vcpu->arch.sched_cpus, - 
vcpu->arch.pre_sched_cpus))) + vcpu->arch.pre_sched_cpus))) { + kvm_write_lsudvmbm(kvm); return; + } /* Re-calculate sched_cpus for this VM */ spin_lock(&kvm->arch.sched_lock); @@ -231,7 +323,17 @@ void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) cpumask_copy(kvm->arch.sched_cpus, &mask); + kvm_flush_remote_tlbs(kvm); + + /* + * Re-calculate LSUDVMBM_EL2 for this VM and kick all vcpus + * out to reload the LSUDVMBM configuration. + */ + kvm_update_vm_lsudvmbm(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_RELOAD_TLBI_DVMBM); + out_unlock: + __kvm_write_lsudvmbm(kvm); spin_unlock(&kvm->arch.sched_lock); } @@ -262,3 +364,13 @@ void kvm_sched_affinity_vm_destroy(struct kvm *kvm) free_cpumask_var(kvm->arch.sched_cpus); } + +void kvm_hisi_reload_lsudvmbm(struct kvm *kvm) +{ + if (WARN_ON_ONCE(!kvm_dvmbm_support)) + return; + + preempt_disable(); + kvm_write_lsudvmbm(kvm); + preempt_enable(); +} diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 3de270ad2da5..4e162b7f6688 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -20,6 +20,33 @@ enum hisi_cpu_type { #define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) #define LSUDVM_CTLR_EL2_MASK BIT_ULL(0) +/* + * MPIDR_EL1 layout on HIP09 + * + * Aff3[7:3] - socket ID [0-15] + * Aff3[2:0] - die ID [1,3] + * Aff2 - cluster ID [0-9] + * Aff1 - core ID [0-3] + * Aff0 - thread ID [0,1] + */ + +#define SYS_LSUDVMBM_EL2 sys_reg(3, 4, 15, 7, 5) +#define DVMBM_RANGE_SHIFT 62 +#define DVMBM_RANGE_ONE_DIE 0ULL +#define DVMBM_RANGE_TWO_DIES 1ULL +#define DVMBM_RANGE_ALL_DIES 3ULL + +#define DVMBM_GRAN_SHIFT 61 +#define DVMBM_GRAN_CLUSTER 0ULL +#define DVMBM_GRAN_DIE 1ULL + +#define DVMBM_DIE1_SHIFT 53 +#define DVMBM_DIE2_SHIFT 45 +#define DVMBM_DIE1_CLUSTER_SHIFT 22 +#define DVMBM_DIE2_CLUSTER_SHIFT 0 + +#define DVMBM_MAX_DIES 32 + void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void); @@ -30,6 +57,7 @@ 
int kvm_sched_affinity_vm_init(struct kvm *kvm); void kvm_sched_affinity_vm_destroy(struct kvm *kvm); void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu); void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu); +void kvm_hisi_reload_lsudvmbm(struct kvm *kvm); #else static inline void probe_hisi_cpu_type(void) {} static inline bool hisi_ncsnp_supported(void) @@ -53,6 +81,7 @@ static inline int kvm_sched_affinity_vm_init(struct kvm *kvm) static inline void kvm_sched_affinity_vm_destroy(struct kvm *kvm) {} static inline void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) {} static inline void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) {} +static inline void kvm_hisi_reload_lsudvmbm(struct kvm *kvm) {} #endif /* CONFIG_KVM_HISI_VIRT */ #endif /* __HISI_VIRT_H__ */ -- Gitee From 218e356214fe5ff48410f86981250d4800270875 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Mon, 1 Apr 2024 10:04:48 +0800 Subject: [PATCH 52/59] KVM: arm64: Translate logic cluster id to physical cluster id when updating lsudvmbm commit da4dd0618a8a8e61d9e44f85eb9934f429411c70 openEuler For dvmbm feature, MN requires physical cluster id while it is filled with logic cluster id right now. In some situations which physical cluster id is not equal to logic cluster id such as in PG boards, it will cause issues when enabling dvmbm. To avoid the issue, translate logic cluster id to physical cluster id when updating lsudvmbm. Signed-off-by: Xiang Chen Signed-off-by: Yanan Wang Signed-off-by: caijian Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/arm.c | 3 + arch/arm64/kvm/hisilicon/hisi_virt.c | 151 +++++++++++++++++++++++++-- arch/arm64/kvm/hisilicon/hisi_virt.h | 17 +++ 3 files changed, 164 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b4f1a94779b5..07be7eefc972 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2790,6 +2790,9 @@ static __init int kvm_arm_init(void) kvm_info("KVM ncsnp %s\n", kvm_ncsnp_support ? 
"enabled" : "disabled"); kvm_info("KVM dvmbm %s\n", kvm_dvmbm_support ? "enabled" : "disabled"); + if (kvm_dvmbm_support) + kvm_get_pg_cfg(); + in_hyp_mode = is_kernel_in_hyp_mode(); if (in_hyp_mode) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 662ddf5b124b..68809f10e8d7 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -234,12 +234,97 @@ static int kvm_dvmbm_get_dies_info(struct kvm *kvm, u64 *vm_aff3s, int size) return num; } +static u32 socket_num, die_num; + +static u32 kvm_get_socket_num(void) +{ + int socket_id[MAX_PG_CFG_SOCKETS], cpu; + u32 num = 0; + + for_each_cpu(cpu, cpu_possible_mask) { + bool found = false; + u64 aff3, socket; + int i; + + aff3 = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 3); + /* aff3[7:3]: socket ID */ + socket = (aff3 & SOCKET_ID_MASK) >> SOCKET_ID_SHIFT; + for (i = 0; i < num; i++) { + if (socket_id[i] == socket) { + found = true; + break; + } + } + if (!found) + socket_id[num++] = socket; + } + return num; +} + +static u32 kvm_get_die_num(void) +{ + int die_id[MAX_DIES_PER_SOCKET], cpu; + u32 num = 0; + + for_each_cpu(cpu, cpu_possible_mask) { + bool found = false; + u64 aff3, die; + int i; + + aff3 = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 3); + /* aff3[2:0]: die ID */ + die = aff3 & DIE_ID_MASK; + for (i = 0; i < num; i++) { + if (die_id[i] == die) { + found = true; + break; + } + } + if (!found) + die_id[num++] = die; + } + return num; +} + +static u32 g_die_pg[MAX_PG_CFG_SOCKETS * MAX_DIES_PER_SOCKET][MAX_CLUSTERS_PER_DIE]; + +static void kvm_get_die_pg(unsigned long pg_cfg, int socket_id, int die_id) +{ + u32 pg_num = 0, i, j; + u32 pg_flag[MAX_CLUSTERS_PER_DIE]; + u32 die_tmp = socket_id * die_num + die_id; + + for (i = 0; i < MAX_CLUSTERS_PER_DIE; i++) { + if (test_bit(i, &pg_cfg)) + pg_num++; + g_die_pg[die_tmp][i] = i; + pg_flag[i] = 0; + } + + for (i = 0; i < MAX_CLUSTERS_PER_DIE - pg_num; i++) { + if (test_bit(i, 
&pg_cfg)) { + for (j = 0; j < pg_num; j++) { + u32 cluster_bak = MAX_CLUSTERS_PER_DIE - pg_num + j; + + if (!test_bit(cluster_bak, &pg_cfg) && + !pg_flag[cluster_bak]) { + pg_flag[cluster_bak] = 1; + g_die_pg[die_tmp][i] = cluster_bak; + g_die_pg[die_tmp][cluster_bak] = i; + break; + } + } + } + } +} + static void kvm_update_vm_lsudvmbm(struct kvm *kvm) { - u64 mpidr, aff3, aff2, aff1; + u64 mpidr, aff3, aff2, aff1, phy_aff2; u64 vm_aff3s[DVMBM_MAX_DIES]; u64 val; int cpu, nr_dies; + u32 socket_id, die_id; nr_dies = kvm_dvmbm_get_dies_info(kvm, vm_aff3s, DVMBM_MAX_DIES); if (nr_dies > 2) { @@ -254,10 +339,18 @@ static void kvm_update_vm_lsudvmbm(struct kvm *kvm) /* fulfill bits [52:0] */ for_each_cpu(cpu, kvm->arch.sched_cpus) { mpidr = cpu_logical_map(cpu); + aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3); aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); aff1 = MPIDR_AFFINITY_LEVEL(mpidr, 1); - - val |= 1ULL << (aff2 * 4 + aff1); + socket_id = (aff3 & SOCKET_ID_MASK) >> SOCKET_ID_SHIFT; + die_id = (aff3 & DIE_ID_MASK) >> DIE_ID_SHIFT; + if (die_id == TOTEM_B_ID) + die_id = 0; + else + die_id = 1; + + phy_aff2 = g_die_pg[socket_id * die_num + die_id][aff2]; + val |= 1ULL << (phy_aff2 * 4 + aff1); } goto out_update; @@ -274,11 +367,20 @@ static void kvm_update_vm_lsudvmbm(struct kvm *kvm) mpidr = cpu_logical_map(cpu); aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3); aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); - - if (aff3 == vm_aff3s[0]) - val |= 1ULL << (aff2 + DVMBM_DIE1_CLUSTER_SHIFT); + socket_id = (aff3 & SOCKET_ID_MASK) >> SOCKET_ID_SHIFT; + die_id = (aff3 & DIE_ID_MASK) >> DIE_ID_SHIFT; + if (die_id == TOTEM_B_ID) + die_id = 0; else - val |= 1ULL << (aff2 + DVMBM_DIE2_CLUSTER_SHIFT); + die_id = 1; + + if (aff3 == vm_aff3s[0]) { + phy_aff2 = g_die_pg[socket_id * die_num + die_id][aff2]; + val |= 1ULL << (phy_aff2 + DVMBM_DIE1_CLUSTER_SHIFT); + } else { + phy_aff2 = g_die_pg[socket_id * die_num + die_id][aff2]; + val |= 1ULL << (phy_aff2 + DVMBM_DIE2_CLUSTER_SHIFT); + } } out_update: @@ -345,6 
+447,41 @@ void kvm_tlbi_dvmbm_vcpu_put(struct kvm_vcpu *vcpu) cpumask_copy(vcpu->arch.pre_sched_cpus, vcpu->arch.sched_cpus); } +void kvm_get_pg_cfg(void) +{ + void __iomem *mn_base; + u32 i, j; + u32 pg_cfgs[MAX_PG_CFG_SOCKETS * MAX_DIES_PER_SOCKET]; + u64 mn_phy_base; + u32 val; + + socket_num = kvm_get_socket_num(); + die_num = kvm_get_die_num(); + + for (i = 0; i < socket_num; i++) { + for (j = 0; j < die_num; j++) { + + /* + * totem B means the first CPU DIE within a SOCKET, + * totem A means the second one. + */ + mn_phy_base = (j == 0) ? TB_MN_BASE : TA_MN_BASE; + mn_phy_base += CHIP_ADDR_OFFSET(i); + mn_phy_base += MN_ECO0_OFFSET; + + mn_base = ioremap(mn_phy_base, 4); + if (!mn_base) { + kvm_info("MN base addr ioremap failed\n"); + return; + } + val = readl_relaxed(mn_base); + pg_cfgs[j + i * die_num] = val & 0xff; + kvm_get_die_pg(pg_cfgs[j + i * die_num], i, j); + iounmap(mn_base); + } + } +} + int kvm_sched_affinity_vm_init(struct kvm *kvm) { if (!kvm_dvmbm_support) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 4e162b7f6688..31bcb62235a4 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -20,6 +20,21 @@ enum hisi_cpu_type { #define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) #define LSUDVM_CTLR_EL2_MASK BIT_ULL(0) +#define MAX_CLUSTERS_PER_DIE 8 +#define TB_MN_BASE 0x00C6067f0000 +#define TA_MN_BASE 0x0046067F0000 +#define CHIP_ADDR_OFFSET(_chip) (((((_chip) >> 3) & 0x1) * 0x80000000000) + \ + ((((_chip) >> 2) & 0x1) * (0x100000000000)) + \ + (((_chip) & 0x3) * 0x200000000000)) +#define MAX_PG_CFG_SOCKETS 4 +#define MAX_DIES_PER_SOCKET 2 +#define MN_ECO0_OFFSET 0xc00 +#define SOCKET_ID_MASK 0xf8 +#define SOCKET_ID_SHIFT 3 +#define DIE_ID_MASK 0x7 +#define DIE_ID_SHIFT 0 +#define TOTEM_B_ID 3 + /* * MPIDR_EL1 layout on HIP09 * @@ -50,6 +65,7 @@ enum hisi_cpu_type { void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void); 
+void kvm_get_pg_cfg(void); int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu); void kvm_sched_affinity_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -68,6 +84,7 @@ static inline bool hisi_dvmbm_supported(void) { return false; } +static inline void kvm_get_pg_cfg(void) {} static inline int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu) { -- Gitee From 17c88c8ba52abbbed1b613941ab4a59c7004e67c Mon Sep 17 00:00:00 2001 From: Zhou Wang Date: Tue, 8 Oct 2024 19:21:36 +0800 Subject: [PATCH 53/59] KVM: arm64: Add new HiSi CPU type for supporting DVMBM commit 0fb85a6177b5c8cc0d499502dbeb1111c3751ec5 openEuler Add new HiSi CPU type for supporting DVMBM, and expand ACPI hisi oem table id string to 8 bit. Signed-off-by: Zhou Wang Signed-off-by: caijian Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 16 ++++++++++------ arch/arm64/kvm/hisilicon/hisi_virt.h | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 68809f10e8d7..ea6ab834a46e 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -18,15 +18,19 @@ static const char * const hisi_cpu_type_str[] = { "Hisi1616", "Hisi1620", "HIP09", + "HIP10", + "HIP10C", "Unknown" }; /* ACPI Hisi oem table id str */ static const char * const oem_str[] = { - "HIP06", /* Hisi 1612 */ - "HIP07", /* Hisi 1616 */ - "HIP08", /* Hisi 1620 */ - "HIP09" /* HIP09 */ + "HIP06 ", /* Hisi 1612 */ + "HIP07 ", /* Hisi 1616 */ + "HIP08 ", /* Hisi 1620 */ + "HIP09 ", /* HIP09 */ + "HIP10 ", /* HIP10 */ + "HIP10C " /* HIP10C */ }; /* @@ -47,7 +51,7 @@ static enum hisi_cpu_type acpi_get_hisi_cpu_type(void) } for (i = 0; i < str_size; ++i) { - if (!strncmp(oem_str[i], table->oem_table_id, 5)) + if (!strncmp(oem_str[i], table->oem_table_id, 8)) return i; } @@ -153,7 +157,7 @@ static void hardware_disable_dvmbm(void *data) bool hisi_dvmbm_supported(void) { - if (cpu_type != 
HI_IP09) + if (cpu_type != HI_IP10 && cpu_type != HI_IP10C) return false; /* Determine whether DVMBM is supported by the hardware */ diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index 31bcb62235a4..e3b006343ead 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -12,10 +12,12 @@ enum hisi_cpu_type { HI_1616, HI_1620, HI_IP09, + HI_IP10, + HI_IP10C, UNKNOWN_HI_TYPE }; -/* HIP09 */ +/* HIP10 */ #define AIDR_EL1_DVMBM_MASK GENMASK_ULL(13, 12) #define SYS_LSUDVM_CTRL_EL2 sys_reg(3, 4, 15, 7, 4) #define LSUDVM_CTLR_EL2_MASK BIT_ULL(0) @@ -36,7 +38,7 @@ enum hisi_cpu_type { #define TOTEM_B_ID 3 /* - * MPIDR_EL1 layout on HIP09 + * MPIDR_EL1 layout on HIP10 * * Aff3[7:3] - socket ID [0-15] * Aff3[2:0] - die ID [1,3] -- Gitee From b994d2506b5bb59e5763dc35eaab2554b6ba8898 Mon Sep 17 00:00:00 2001 From: yangjinqian Date: Thu, 27 Mar 2025 15:55:53 +0800 Subject: [PATCH 54/59] kvm: hisi_virt: fix kernel panic when enable DVMBM in nVHE commit b7bcdd9e486ea7c630822750a5bb2f3012755616 openEuler When the kernel is in nvhe mode and is in EL1, the original judgment logic causes the hardware_disable_dvmbm function to read the EL2 register in EL1, causing a panic during kernel startup. 
Signed-off-by: Jinqian Yang Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index ea6ab834a46e..9cac8947af6d 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -160,12 +160,17 @@ bool hisi_dvmbm_supported(void) if (cpu_type != HI_IP10 && cpu_type != HI_IP10C) return false; + if (!is_kernel_in_hyp_mode()) { + kvm_info("Hisi dvmbm not supported by KVM nVHE mode\n"); + return false; + } + /* Determine whether DVMBM is supported by the hardware */ if (!(read_sysreg(aidr_el1) & AIDR_EL1_DVMBM_MASK)) return false; /* User provided kernel command-line parameter */ - if (!dvmbm_enabled || !is_kernel_in_hyp_mode()) { + if (!dvmbm_enabled) { on_each_cpu(hardware_disable_dvmbm, NULL, 1); return false; } -- Gitee From dc0d5b8ccad918950893f667d2f15b19e7a1ef8f Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 27 Mar 2025 15:55:54 +0800 Subject: [PATCH 55/59] kvm: hisi_virt: Update TLBI broadcast feature for hip12 commit 2671ba2219683b30ced931875147193cbdd69aab openEuler Compared with hip09, there are some differences on TLBI broadcast feature for hip12 including: - No need to translate logical cluster id to physical cluster id; - The minimum granularity of TLBI broadcast is cluster; - Some fields of register LSUDVMBM changes; So update for corresponding changes. 
MPIDR_EL1 layout on HIP12: Aff3[3:2] - socket ID [0-3] Aff3[1:0] - die ID [0,1] Aff2 - cluster ID [0-5] Aff1 - core ID [0-15] Aff0 - thread ID [0,1] Signed-off-by: Xiang Chen Signed-off-by: Jinqian Yang Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 59 +++++++++++++++++++++++++++- arch/arm64/kvm/hisilicon/hisi_virt.h | 8 ++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 9cac8947af6d..728b3b61dc94 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -20,6 +20,7 @@ static const char * const hisi_cpu_type_str[] = { "HIP09", "HIP10", "HIP10C", + "HIP12", "Unknown" }; @@ -30,7 +31,8 @@ static const char * const oem_str[] = { "HIP08 ", /* Hisi 1620 */ "HIP09 ", /* HIP09 */ "HIP10 ", /* HIP10 */ - "HIP10C " /* HIP10C */ + "HIP10C ", /* HIP10C */ + "HIP12 " /* HIP12 */ }; /* @@ -396,6 +398,56 @@ static void kvm_update_vm_lsudvmbm(struct kvm *kvm) kvm->arch.tlbi_dvmbm = val; } +static void kvm_update_vm_lsudvmbm_hip12(struct kvm *kvm) +{ + u64 mpidr, aff3, aff2; + u64 vm_aff3s[DVMBM_MAX_DIES_HIP12]; + u64 val; + int cpu, nr_dies; + + nr_dies = kvm_dvmbm_get_dies_info(kvm, vm_aff3s, DVMBM_MAX_DIES_HIP12); + if (nr_dies > 2) { + val = DVMBM_RANGE_ALL_DIES << DVMBM_RANGE_SHIFT; + goto out_update; + } + + if (nr_dies == 1) { + val = DVMBM_RANGE_ONE_DIE << DVMBM_RANGE_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_VDIE_SHIFT_HIP12; + + /* fulfill bits [11:6] */ + for_each_cpu(cpu, kvm->arch.sched_cpus) { + mpidr = cpu_logical_map(cpu); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + + val |= 1ULL << (aff2 + DVMBM_DIE1_CLUSTER_SHIFT_HIP12); + } + + goto out_update; + } + + /* nr_dies == 2 */ + val = DVMBM_RANGE_TWO_DIES << DVMBM_RANGE_SHIFT | + DVMBM_GRAN_CLUSTER << DVMBM_GRAN_SHIFT | + vm_aff3s[0] << DVMBM_DIE1_VDIE_SHIFT_HIP12 | + vm_aff3s[1] << DVMBM_DIE2_VDIE_SHIFT_HIP12; + + /* and fulfill bits [11:0] */ + 
for_each_cpu(cpu, kvm->arch.sched_cpus) { + mpidr = cpu_logical_map(cpu); + aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3); + aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2); + + if (aff3 == vm_aff3s[0]) + val |= 1ULL << (aff2 + DVMBM_DIE1_CLUSTER_SHIFT_HIP12); + else + val |= 1ULL << (aff2 + DVMBM_DIE2_CLUSTER_SHIFT_HIP12); + } + +out_update: + kvm->arch.tlbi_dvmbm = val; +} + void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -440,7 +492,10 @@ void kvm_tlbi_dvmbm_vcpu_load(struct kvm_vcpu *vcpu) * Re-calculate LSUDVMBM_EL2 for this VM and kick all vcpus * out to reload the LSUDVMBM configuration. */ - kvm_update_vm_lsudvmbm(kvm); + if (cpu_type == HI_IP12) + kvm_update_vm_lsudvmbm_hip12(kvm); + else + kvm_update_vm_lsudvmbm(kvm); kvm_make_all_cpus_request(kvm, KVM_REQ_RELOAD_TLBI_DVMBM); out_unlock: diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index e3b006343ead..c45d319e7b41 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -14,6 +14,7 @@ enum hisi_cpu_type { HI_IP09, HI_IP10, HI_IP10C, + HI_IP12, UNKNOWN_HI_TYPE }; @@ -64,6 +65,13 @@ enum hisi_cpu_type { #define DVMBM_MAX_DIES 32 +/* HIP12 */ +#define DVMBM_DIE1_VDIE_SHIFT_HIP12 57 +#define DVMBM_DIE2_VDIE_SHIFT_HIP12 53 +#define DVMBM_DIE1_CLUSTER_SHIFT_HIP12 6 +#define DVMBM_DIE2_CLUSTER_SHIFT_HIP12 0 +#define DVMBM_MAX_DIES_HIP12 8 + void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); bool hisi_dvmbm_supported(void); -- Gitee From f008a6fa6d040a52bbc15b634ab9b7183a744401 Mon Sep 17 00:00:00 2001 From: yangjinqian Date: Thu, 27 Mar 2025 15:55:55 +0800 Subject: [PATCH 56/59] KVM: arm64: Add new HiSi CPU type to support DVMBM commit 0fb85a6177b5c8cc0d499502dbeb1111c3751ec5 openEuler Add new HiSi CPU type HIP12 for supporting DVMBM. Function kvm_get_pg_cfg() is used to get configuration for translating logic cluster id to physical cluster id which is not needed by hip12, so skip it for hip12. 
Signed-off-by: Jinqian Yang Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 728b3b61dc94..d95b96ee7237 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -159,7 +159,8 @@ static void hardware_disable_dvmbm(void *data) bool hisi_dvmbm_supported(void) { - if (cpu_type != HI_IP10 && cpu_type != HI_IP10C) + if (cpu_type != HI_IP10 && cpu_type != HI_IP10C && + cpu_type != HI_IP12) return false; if (!is_kernel_in_hyp_mode()) { @@ -519,6 +520,9 @@ void kvm_get_pg_cfg(void) u64 mn_phy_base; u32 val; + if (cpu_type == HI_IP12) + return; + socket_num = kvm_get_socket_num(); die_num = kvm_get_die_num(); -- Gitee From b2e9dc674373d48359100fedeefa07a6b612bf5a Mon Sep 17 00:00:00 2001 From: Zhou Wang Date: Mon, 20 Oct 2025 21:30:02 +0800 Subject: [PATCH 57/59] KVM: hisi_virt: tlbi: Fix wrong CPU aff3 conversion between MPIDR and SYS_LSUDVMBM_EL2 commit b7334ad8c897eaa438dfab360e1213b7817d62a4 openEuler TLBI broadcast CPU bitmap should be set in SYS_LSUDVMBM_EL2. Now we make a mistake when doing the conversion between MPIDR and SYS_LSUDVMBM_EL2. Fields of Die ID and Socket ID in Aff3 are different between MPIDR and SYS_LSUDVMBM_EL2 in HIP12, however, they are same in current wrong logic. This patch fixes this problem. 
Fixes: ("kvm: hisi_virt: Update TLBI broadcast feature for hip12") Signed-off-by: Zhou Wang Signed-off-by: Jian Cai Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 25 ++++++++++++++++++++++--- arch/arm64/kvm/hisilicon/hisi_virt.h | 25 ++++++++++++++++++++----- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index d95b96ee7237..21e616b8806c 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -8,6 +8,7 @@ #include #include #include "hisi_virt.h" +#include static enum hisi_cpu_type cpu_type = UNKNOWN_HI_TYPE; @@ -399,12 +400,27 @@ static void kvm_update_vm_lsudvmbm(struct kvm *kvm) kvm->arch.tlbi_dvmbm = val; } +static u64 convert_aff3_to_die_hip12(u64 aff3) +{ + /* + * On HIP12, we use 4 bits to represent a die in SYS_LSUDVMBM_EL2. + * + * die1: socket ID (bits[60:59]) + die ID (bits[58:57]) + * die2: socket ID (bits[56:55]) + die ID (bits[54:53]) + * + * We therefore need to properly encode Aff3 into it. 
+ */ + return FIELD_GET(MPIDR_AFF3_SOCKET_ID_MASK, aff3) << 2 | + FIELD_GET(MPIDR_AFF3_DIE_ID_MASK, aff3); +} + static void kvm_update_vm_lsudvmbm_hip12(struct kvm *kvm) { u64 mpidr, aff3, aff2; u64 vm_aff3s[DVMBM_MAX_DIES_HIP12]; u64 val; int cpu, nr_dies; + u64 die1, die2; nr_dies = kvm_dvmbm_get_dies_info(kvm, vm_aff3s, DVMBM_MAX_DIES_HIP12); if (nr_dies > 2) { @@ -413,8 +429,9 @@ static void kvm_update_vm_lsudvmbm_hip12(struct kvm *kvm) } if (nr_dies == 1) { + die1 = convert_aff3_to_die_hip12(vm_aff3s[0]); val = DVMBM_RANGE_ONE_DIE << DVMBM_RANGE_SHIFT | - vm_aff3s[0] << DVMBM_DIE1_VDIE_SHIFT_HIP12; + die1 << DVMBM_DIE1_SHIFT_HIP12; /* fulfill bits [11:6] */ for_each_cpu(cpu, kvm->arch.sched_cpus) { @@ -428,10 +445,12 @@ static void kvm_update_vm_lsudvmbm_hip12(struct kvm *kvm) } /* nr_dies == 2 */ + die1 = convert_aff3_to_die_hip12(vm_aff3s[0]); + die2 = convert_aff3_to_die_hip12(vm_aff3s[1]); val = DVMBM_RANGE_TWO_DIES << DVMBM_RANGE_SHIFT | DVMBM_GRAN_CLUSTER << DVMBM_GRAN_SHIFT | - vm_aff3s[0] << DVMBM_DIE1_VDIE_SHIFT_HIP12 | - vm_aff3s[1] << DVMBM_DIE2_VDIE_SHIFT_HIP12; + die1 << DVMBM_DIE1_SHIFT_HIP12 | + die2 << DVMBM_DIE2_SHIFT_HIP12; /* and fulfill bits [11:0] */ for_each_cpu(cpu, kvm->arch.sched_cpus) { diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.h b/arch/arm64/kvm/hisilicon/hisi_virt.h index c45d319e7b41..c57ca65970de 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.h +++ b/arch/arm64/kvm/hisilicon/hisi_virt.h @@ -39,7 +39,7 @@ enum hisi_cpu_type { #define TOTEM_B_ID 3 /* - * MPIDR_EL1 layout on HIP10 + * MPIDR_EL1 layout on HIP10/HIP10C * * Aff3[7:3] - socket ID [0-15] * Aff3[2:0] - die ID [1,3] @@ -65,12 +65,27 @@ enum hisi_cpu_type { #define DVMBM_MAX_DIES 32 -/* HIP12 */ -#define DVMBM_DIE1_VDIE_SHIFT_HIP12 57 -#define DVMBM_DIE2_VDIE_SHIFT_HIP12 53 +/* + * MPIDR_EL1 layout on HIP12 + * + * Aff3[4:3] - socket ID [0-3] + * Aff3[2:0] - die ID [0,1] + * Aff2[2:0] - cluster ID [0-5] + * Aff1[3:0] - core ID [0-15] + * Aff0[0] - thread ID [0,1] + * 
+ * On HIP12, cpu die is named as vdie. Actually, + * vdie is equivalent to cpu die. Here use die + * to describe vdie. + */ + +#define MPIDR_AFF3_SOCKET_ID_MASK GENMASK(4, 3) +#define MPIDR_AFF3_DIE_ID_MASK GENMASK(2, 0) +#define DVMBM_DIE1_SHIFT_HIP12 57 +#define DVMBM_DIE2_SHIFT_HIP12 53 #define DVMBM_DIE1_CLUSTER_SHIFT_HIP12 6 #define DVMBM_DIE2_CLUSTER_SHIFT_HIP12 0 -#define DVMBM_MAX_DIES_HIP12 8 +#define DVMBM_MAX_DIES_HIP12 8 void probe_hisi_cpu_type(void); bool hisi_ncsnp_supported(void); -- Gitee From 12ce0de51654a2d4eee0e35390c4fdf18a4078a4 Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Fri, 30 May 2025 11:25:30 +0800 Subject: [PATCH 58/59] KVM: arm64: fix memory leak in TLBI commit 18c863e221099a80a33658a439f177107aa19f68 openEuler If sched_cpus is successfully allocated but pre_sched_cpus fails to be allocated, the memory of the former is not released. Fixes: ("KVM: arm64: Add kvm_vcpu_arch::sched_cpus and pre_sched_cpus") Signed-off-by: Jinqian Yang Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kvm/hisilicon/hisi_virt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hisilicon/hisi_virt.c b/arch/arm64/kvm/hisilicon/hisi_virt.c index 21e616b8806c..cf1e8fe3f4c4 100644 --- a/arch/arm64/kvm/hisilicon/hisi_virt.c +++ b/arch/arm64/kvm/hisilicon/hisi_virt.c @@ -192,10 +192,14 @@ int kvm_sched_affinity_vcpu_init(struct kvm_vcpu *vcpu) if (!kvm_dvmbm_support) return 0; - if (!zalloc_cpumask_var(&vcpu->arch.sched_cpus, GFP_ATOMIC) || - !zalloc_cpumask_var(&vcpu->arch.pre_sched_cpus, GFP_ATOMIC)) + if (!zalloc_cpumask_var(&vcpu->arch.sched_cpus, GFP_ATOMIC)) return -ENOMEM; + if (!zalloc_cpumask_var(&vcpu->arch.pre_sched_cpus, GFP_ATOMIC)) { + free_cpumask_var(vcpu->arch.sched_cpus); + return -ENOMEM; + } + return 0; } -- Gitee From 51a87c76cb5747a100f76e11a13b2ac3c192609b Mon Sep 17 00:00:00 2001 From: lishusen Date: Thu, 21 Mar 2024 12:18:40 +0800 Subject: [PATCH 59/59] arm64: Delete macro in the 
ncsnp feature commit 6d800bea46f46d11241eff6362d2c04fdcab4eac openEuler The macro in the ncsnp feature is deleted to rectify the compilation problem. Signed-off-by: lishusen Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/kernel/image-vars.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 82957f93fa21..ef942592d767 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -111,10 +111,8 @@ KVM_NVHE_ALIAS(__hyp_rodata_end); /* pKVM static key */ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); -#ifdef CONFIG_KVM_HISI_VIRT /* Capability of non-cacheable snooping */ KVM_NVHE_ALIAS(kvm_ncsnp_support); -#endif #endif /* CONFIG_KVM */ -- Gitee