diff --git a/README.md b/README.md index 642183cc5a2135dde16e6c8d6cc919f1ab53853b..8e7d53711fed34fdb99eeb37abb490e80a597bd3 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ | CSPResNet50 | FP16 | [✅](models/cv/classification/cspresnet50/igie) | [✅](models/cv/classification/cspresnet50/ixrt) | 4.3.0 | | | INT8 | | [✅](models/cv/classification/cspresnet50/ixrt) | 4.3.0 | | CSPResNeXt50 | FP16 | [✅](models/cv/classification/cspresnext50/igie) | [✅](models/cv/classification/cspresnext50/ixrt) | 4.3.0 | +| DeiT-B | FP16 | | [✅](models/cv/classification/deit_b/ixrt) | 4.4.0 | | DeiT-tiny | FP16 | [✅](models/cv/classification/deit_tiny/igie) | [✅](models/cv/classification/deit_tiny/ixrt) | 4.3.0 | | DenseNet121 | FP16 | [✅](models/cv/classification/densenet121/igie) | [✅](models/cv/classification/densenet121/ixrt) | 4.3.0 | | | INT8 | [✅](models/cv/classification/densenet121/igie) | | 4.4.0 | @@ -152,6 +153,7 @@ | MNASNet0_75 | FP16 | [✅](models/cv/classification/mnasnet0_75/igie) | | 4.3.0 | | MNASNet1_0 | FP16 | [✅](models/cv/classification/mnasnet1_0/igie) | | 4.3.0 | | MNASNet1_3 | FP16 | [✅](models/cv/classification/mnasnet1_3/igie) | | 4.3.0 | +| MobileNetV1 | FP16 | | [✅](models/cv/classification/mobilenet_v1/ixrt) | 4.4.0 | | MobileNetV2 | FP16 | [✅](models/cv/classification/mobilenet_v2/igie) | [✅](models/cv/classification/mobilenet_v2/ixrt) | 4.3.0 | | | INT8 | [✅](models/cv/classification/mobilenet_v2/igie) | [✅](models/cv/classification/mobilenet_v2/ixrt) | 4.3.0 | | MobileNetV3_Large | FP16 | [✅](models/cv/classification/mobilenet_v3_large/igie) | | 4.3.0 | @@ -211,7 +213,7 @@ | | INT8 | [✅](models/cv/classification/vgg16/igie) | | 4.3.0 | | VGG19 | FP16 | [✅](models/cv/classification/vgg19/igie) | | 4.3.0 | | VGG19_BN | FP16 | [✅](models/cv/classification/vgg19_bn/igie) | | 4.3.0 | -| ViT | FP16 | [✅](models/cv/classification/vit/igie) | | 4.3.0 | +| ViT | FP16 | [✅](models/cv/classification/vit/igie) | [✅](models/cv/classification/vit/ixrt) | 
4.3.0 | | ViT-B-32 | FP16 | [✅](models/cv/classification/vit_b_32/igie) | | 4.4.0 | | ViT-L-14 | FP16 | [✅](models/cv/classification/vit_l_14/igie) | | 4.4.0 | | Wide ResNet50 | FP16 | [✅](models/cv/classification/wide_resnet50/igie) | [✅](models/cv/classification/wide_resnet50/ixrt) | 4.3.0 | @@ -293,9 +295,10 @@ | Model | Prec. | IGIE | ixRT | IXUCA SDK | |---------------|-------|---------------------------------------|---------------------------------------|-----------| +| CRNN | FP16 | | [✅](models/cv/ocr/crnn/ixrt) | 4.4.0 | +| DBNet | FP16 | | [✅](models/cv/ocr/dbnet/ixrt) | 4.4.0 | | Kie_layoutXLM | FP16 | [✅](models/cv/ocr/kie_layoutxlm/igie) | | 4.3.0 | | SVTR | FP16 | [✅](models/cv/ocr/svtr/igie) | | 4.3.0 | -| CRNN | FP16 | | [✅](models/cv/ocr/crnn/ixrt) | 4.4.0 | #### 姿态估计 @@ -316,6 +319,7 @@ | Model | Prec. | IGIE | ixRT | IXUCA SDK | |-------|-------|------------------------------------------------|------------------------------------------------|-----------| +| DDRNet | FP16 | | [✅](models/cv/semantic_segmentation/ddrnet/ixrt) | 4.4.0 | | UNet | FP16 | [✅](models/cv/semantic_segmentation/unet/igie) | [✅](models/cv/semantic_segmentation/unet/ixrt) | 4.3.0 | #### 多目标跟踪 @@ -377,6 +381,7 @@ | Model | Prec. | IGIE | ixRT | IXUCA SDK | |-----------------|-------|-----------------------------------------------------|-----------------------------------------------------------|-----------| | Conformer | FP16 | [✅](models/audio/speech_recognition/conformer/igie) | [✅](models/audio/speech_recognition/conformer/ixrt) | 4.3.0 | +| DeepSpeech2 | FP16 | | [✅](models/audio/speech_recognition/deepspeech2/ixrt) | 4.4.0 | | Transformer ASR | FP16 | | [✅](models/audio/speech_recognition/transformer_asr/ixrt) | 4.2.0 | ### 其他 diff --git a/README_en.md b/README_en.md index af9c0e6e2ff3c5ff46286591c8630d124f0ba2c0..0b55d2b038a0e7c4da557767ab61a3ae3e5519a5 100644 --- a/README_en.md +++ b/README_en.md @@ -111,6 +111,7 @@ inference to be expanded in the future. 
| CSPResNet50 | FP16 | [✅](models/cv/classification/cspresnet50/igie) | [✅](models/cv/classification/cspresnet50/ixrt) | 4.3.0 | | | INT8 | | [✅](models/cv/classification/cspresnet50/ixrt) | 4.3.0 | | CSPResNeXt50 | FP16 | [✅](models/cv/classification/cspresnext50/igie) | [✅](models/cv/classification/cspresnext50/ixrt) | 4.3.0 | +| DeiT-B | FP16 | | [✅](models/cv/classification/deit_b/ixrt) | 4.4.0 | | DeiT-tiny | FP16 | [✅](models/cv/classification/deit_tiny/igie) | [✅](models/cv/classification/deit_tiny/ixrt) | 4.3.0 | | DenseNet121 | FP16 | [✅](models/cv/classification/densenet121/igie) | [✅](models/cv/classification/densenet121/ixrt) | 4.3.0 | | | INT8 | [✅](models/cv/classification/densenet121/igie) | | 4.4.0 | @@ -144,6 +145,7 @@ inference to be expanded in the future. | MNASNet0_75 | FP16 | [✅](models/cv/classification/mnasnet0_75/igie) | | 4.3.0 | | MNASNet1_0 | FP16 | [✅](models/cv/classification/mnasnet1_0/igie) | | 4.3.0 | | MNASNet1_3 | FP16 | [✅](models/cv/classification/mnasnet1_3/igie) | | 4.3.0 | +| MobileNetV1 | FP16 | | [✅](models/cv/classification/mobilenet_v1/ixrt) | 4.4.0 | | MobileNetV2 | FP16 | [✅](models/cv/classification/mobilenet_v2/igie) | [✅](models/cv/classification/mobilenet_v2/ixrt) | 4.3.0 | | | INT8 | [✅](models/cv/classification/mobilenet_v2/igie) | [✅](models/cv/classification/mobilenet_v2/ixrt) | 4.3.0 | | MobileNetV3_Large | FP16 | [✅](models/cv/classification/mobilenet_v3_large/igie) | | 4.3.0 | @@ -203,7 +205,7 @@ inference to be expanded in the future. 
| | INT8 | [✅](models/cv/classification/vgg16/igie) | | 4.3.0 | | VGG19 | FP16 | [✅](models/cv/classification/vgg19/igie) | | 4.3.0 | | VGG19_BN | FP16 | [✅](models/cv/classification/vgg19_bn/igie) | | 4.3.0 | -| ViT | FP16 | [✅](models/cv/classification/vit/igie) | | 4.3.0 | +| ViT | FP16 | [✅](models/cv/classification/vit/igie) | [✅](models/cv/classification/vit/ixrt) | 4.3.0 | | ViT-B-32 | FP16 | [✅](models/cv/classification/vit_b_32/igie) | | 4.4.0 | | ViT-L-14 | FP16 | [✅](models/cv/classification/vit_l_14/igie) | | 4.4.0 | | Wide ResNet50 | FP16 | [✅](models/cv/classification/wide_resnet50/igie) | [✅](models/cv/classification/wide_resnet50/ixrt) | 4.3.0 | @@ -284,9 +286,10 @@ inference to be expanded in the future. | Model | Prec. | IGIE | ixRT | IXUCA SDK | |---------------|-------|---------------------------------------|---------------------------------------|-----------| +| CRNN | FP16 | | [✅](models/cv/ocr/crnn/ixrt) | 4.4.0 | +| DBNet | FP16 | | [✅](models/cv/ocr/dbnet/ixrt) | 4.4.0 | | Kie_layoutXLM | FP16 | [✅](models/cv/ocr/kie_layoutxlm/igie) | | 4.3.0 | | SVTR | FP16 | [✅](models/cv/ocr/svtr/igie) | | 4.3.0 | -| CRNN | FP16 | | [✅](models/cv/ocr/crnn/ixrt) | 4.4.0 | #### Pose Estimation @@ -307,6 +310,7 @@ inference to be expanded in the future. | Model | Prec. | IGIE | ixRT | IXUCA SDK | |-------|-------|------------------------------------------------|------------------------------------------------|-----------| +| DDRNet | FP16 | | [✅](models/cv/semantic_segmentation/ddrnet/ixrt) | 4.4.0 | | UNet | FP16 | [✅](models/cv/semantic_segmentation/unet/igie) | [✅](models/cv/semantic_segmentation/unet/ixrt) | 4.3.0 | #### Multi-Object Tracking @@ -367,6 +371,7 @@ inference to be expanded in the future. | Model | Prec. 
| IGIE | ixRT | IXUCA SDK | |-----------------|-------|-----------------------------------------------------|-----------------------------------------------------------|-----------| | Conformer | FP16 | [✅](models/audio/speech_recognition/conformer/igie) | [✅](models/audio/speech_recognition/conformer/ixrt) | 4.3.0 | +| DeepSpeech2 | FP16 | | [✅](models/audio/speech_recognition/deepspeech2/ixrt) | 4.4.0 | | Transformer ASR | FP16 | | [✅](models/audio/speech_recognition/transformer_asr/ixrt) | 4.2.0 | ### Others diff --git a/models/cv/classification/deit_b/ixrt/README.md b/models/cv/classification/deit_b/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc315f978fe6349cc64ec5462ab82ef63381bc50 --- /dev/null +++ b/models/cv/classification/deit_b/ixrt/README.md @@ -0,0 +1,64 @@ +# DeiT-Base (ixRT) + +## Model Description + +DeiT-Base (Data-efficient Image Transformer Base) is a vision transformer model that uses knowledge distillation to achieve competitive performance with fewer training resources. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Download the [imagenet](https://www.image-net.org/download.php) to download the validation dataset. 
+ +### Install Dependencies + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip3 install -r ../../ixrt_common/requirements.txt +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/deit_b.onnx +``` + +## Model Inference + +```bash +export PROJ_DIR=./ +export DATASETS_DIR=/path/to/imagenet_val/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=../../ixrt_common/ +export CONFIG_DIR=../../ixrt_common/config/DEIT_B_CONFIG +``` + +### FP16 + +```bash +# Test ACC +bash scripts/infer_deit_b_fp16_accuracy.sh +# Test FPS +bash scripts/infer_deit_b_fp16_performance.sh +``` + +## Model Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ----------- | --------- | --------- | ------- | -------- | -------- | +| DeiT-Base | 32 | FP16 | 596.381 | 81.7 | 95.6 | diff --git a/models/cv/classification/deit_b/ixrt/ci/prepare.sh b/models/cv/classification/deit_b/ixrt/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..249358b92e91857270d0b16bb2764aa2fdfea99b --- /dev/null +++ b/models/cv/classification/deit_b/ixrt/ci/prepare.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install tqdm onnxsim opencv-python==4.6.0.66 + +mkdir -p checkpoints +cp /root/data/checkpoints/deit_b.onnx checkpoints/ \ No newline at end of file diff --git a/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_accuracy.sh b/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..e8c90b6873cc34410d9db6ddf29ba15371a1bd66 --- /dev/null +++ b/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_accuracy.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=0.796 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=float16 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model Skipped, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} || exit 1 + echo " "Generate ${SIM_MODEL} +fi + +# Refine Model +let step++ +echo; +echo [STEP ${step}] : Refine Model +REFINE_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_refine.onnx +if [ -f ${REFINE_MODEL} ];then + echo " "Refine Model Skipped, ${REFINE_MODEL} has been existed +else + python3 ${RUN_DIR}/refine_model.py \ + --onnx_path ${SIM_MODEL} \ + --dst_onnx_path ${REFINE_MODEL} \ + --bsz ${BSZ} \ + --imgsz ${IMGSIZE} || exit 1 +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skipped, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py \ + --batch_size ${BSZ} \ + --origin_model ${REFINE_MODEL} \ + --output_model ${FINAL_MODEL} || exit 1 + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP 
${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} || exit 1 + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_performance.sh b/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d41dafae6f08256c305d1ecaf32ff824973239d --- /dev/null +++ b/models/cv/classification/deit_b/ixrt/scripts/infer_deit_b_fp16_performance.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=0.796 +WARM_UP=10 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=float16 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model Skipped, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Refine Model +let step++ +echo; +echo [STEP ${step}] : Refine Model +REFINE_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_refine.onnx +if [ -f ${REFINE_MODEL} ];then + echo " "Refine Model Skipped, ${REFINE_MODEL} has been existed +else + python3 ${RUN_DIR}/refine_model.py \ + --onnx_path ${SIM_MODEL} \ + --dst_onnx_path ${REFINE_MODEL} \ + --bsz ${BSZ} \ + --imgsz ${IMGSIZE} +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skipped, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py \ + --batch_size ${BSZ} \ + --origin_model ${REFINE_MODEL} \ + --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine 
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --fps_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/ixrt_common/config/DEIT_B_CONFIG b/models/cv/classification/ixrt_common/config/DEIT_B_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..4170a28b9a0ff4d8518aee4cbe302a3aec2375f6 --- /dev/null +++ b/models/cv/classification/ixrt_common/config/DEIT_B_CONFIG @@ -0,0 +1,33 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# IMGSIZE : 模型输入hw大小 +# MODEL_NAME : 生成onnx/engine的basename +# ORIGINE_MODEL : 原始onnx文件名称 +IMGSIZE=224 +MODEL_NAME=Deit_base +ORIGINE_MODEL=deit_b.onnx + +# QUANT CONFIG (仅PRECISION为int8时生效) + # QUANT_OBSERVER : 量化策略,可选 [hist_percentile, percentile, minmax, entropy, ema] + # QUANT_BATCHSIZE : 量化时组dataloader的batchsize, 最好和onnx中的batchsize保持一致,有些op可能推导shape错误(比如Reshape) + # QUANT_STEP : 量化步数 + # QUANT_SEED : 随机种子 保证量化结果可复现 + # QUANT_EXIST_ONNX : 如果有其他来源的量化模型则填写 +QUANT_OBSERVER=hist_percentile +QUANT_BATCHSIZE=1 +QUANT_STEP=32 +QUANT_SEED=42 +DISABLE_QUANT_LIST= +QUANT_EXIST_ONNX= \ No newline at end of file diff --git a/models/cv/classification/ixrt_common/config/MOBILENET_V1_CONFIG b/models/cv/classification/ixrt_common/config/MOBILENET_V1_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..c49b9b6a30c346b64106ea68609152fee1ca0c32 --- /dev/null +++ b/models/cv/classification/ixrt_common/config/MOBILENET_V1_CONFIG @@ -0,0 +1,33 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# IMGSIZE : 模型输入hw大小 +# MODEL_NAME : 生成onnx/engine的basename +# ORIGINE_MODEL : 原始onnx文件名称 +IMGSIZE=224 +MODEL_NAME=MobileNet_v1 +ORIGINE_MODEL=mobilenet_v1.onnx + +# QUANT CONFIG (仅PRECISION为int8时生效) + # QUANT_OBSERVER : 量化策略,可选 [hist_percentile, percentile, minmax, entropy, ema] + # QUANT_BATCHSIZE : 量化时组dataloader的batchsize, 最好和onnx中的batchsize保持一致,有些op可能推导shape错误(比如Reshape) + # QUANT_STEP : 量化步数 + # QUANT_SEED : 随机种子 保证量化结果可复现 + # QUANT_EXIST_ONNX : 如果有其他来源的量化模型则填写 +QUANT_OBSERVER=percentile +QUANT_BATCHSIZE=1 +QUANT_STEP=32 +QUANT_SEED=42 +DISABLE_QUANT_LIST="fc7 prob" +QUANT_EXIST_ONNX= \ No newline at end of file diff --git a/models/cv/classification/ixrt_common/config/VIT_CONFIG b/models/cv/classification/ixrt_common/config/VIT_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..0e46d9e9e2e86b0f3051593584b70ca77b5360bf --- /dev/null +++ b/models/cv/classification/ixrt_common/config/VIT_CONFIG @@ -0,0 +1,33 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# IMGSIZE : 模型输入hw大小 +# MODEL_NAME : 生成onnx/engine的basename +# ORIGINE_MODEL : 原始onnx文件名称 +IMGSIZE=224 +MODEL_NAME=vit_b_16 +ORIGINE_MODEL=vit_b_16_sim.onnx + +# QUANT CONFIG (仅PRECISION为int8时生效) + # QUANT_OBSERVER : 量化策略,可选 [hist_percentile, percentile, minmax, entropy, ema] + # QUANT_BATCHSIZE : 量化时组dataloader的batchsize, 最好和onnx中的batchsize保持一致,有些op可能推导shape错误(比如Reshape) + # QUANT_STEP : 量化步数 + # QUANT_SEED : 随机种子 保证量化结果可复现 + # QUANT_EXIST_ONNX : 如果有其他来源的量化模型则填写 +QUANT_OBSERVER=minmax +QUANT_BATCHSIZE=32 +QUANT_STEP=32 +QUANT_SEED=42 +DISABLE_QUANT_LIST= +QUANT_EXIST_ONNX= \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/README.md b/models/cv/classification/mobilenet_v1/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6451f4cb4b73cf03daa4223fb9facd35504804cb --- /dev/null +++ b/models/cv/classification/mobilenet_v1/ixrt/README.md @@ -0,0 +1,74 @@ +# MobileNetV1 (ixRT) + +## Model Description + +MobileNetV1 is a efficient model architecture using depthwise separable convolutions. It is designed to efficiently maximize accuracy while being mindful of the tight resource constraints. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Download the [imagenet](https://www.image-net.org/download.php) to download the validation dataset. 
+ +### Install Dependencies + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip3 install -r ../../ixrt_common/requirements.txt +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/mobilenet_v1.onnx +``` + +## Model Inference + +```bash +export PROJ_DIR=./ +export DATASETS_DIR=/path/to/imagenet_val/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=../../ixrt_common/ +export CONFIG_DIR=../../ixrt_common/config/MOBILENET_V1_CONFIG +``` + +### FP16 + +```bash +# Test ACC +bash scripts/infer_mobilenet_v1_fp16_accuracy.sh +# Test FPS +bash scripts/infer_mobilenet_v1_fp16_performance.sh +``` + +### INT8 + +```bash +# Test ACC +bash scripts/infer_mobilenet_v1_int8_accuracy.sh +# Test FPS +bash scripts/infer_mobilenet_v1_int8_performance.sh +``` + +## Model Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ----------- | --------- | --------- | ------- | -------- | -------- | +| MobileNetV1 | 32 | FP16 | 13862.317 | 71.6 | 90.3 | +| MobileNetV1 | 32 | INT8 | 17485.601 | 70.9 | 89.9 | \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/ci/prepare.sh b/models/cv/classification/mobilenet_v1/ixrt/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..02b34ea03d1e2d971ba1bb5de584df38cf41b553 --- /dev/null +++ b/models/cv/classification/mobilenet_v1/ixrt/ci/prepare.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install tqdm onnxsim opencv-python==4.6.0.66 + +mkdir -p checkpoints +cp /root/data/checkpoints/mobilenet_v1.onnx checkpoints/ \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_accuracy.sh b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffe54388de37b72d2a7a6282d5c388842b19cd95 --- /dev/null +++ b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_accuracy.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=float16 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize 
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_performance.sh b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..43a0f216ac9884236f12bf5358c200a59ce290b3 --- /dev/null +++ b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_fp16_performance.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=float16 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + 
--observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --fps_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_accuracy.sh b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..ab3292a656efc48b156509bb121a6104f948c81f --- /dev/null +++ b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_accuracy.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
+# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=-1 +WARM_UP=0 +LOOP_COUNT=-1 +RUN_MODE=ACC +PRECISION=int8 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z ${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, 
${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --acc_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_performance.sh b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8c63eab8a24ff4681bcfd94c44ed73971781dd6 --- /dev/null +++ 
b/models/cv/classification/mobilenet_v1/ixrt/scripts/infer_mobilenet_v1_int8_performance.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +BSZ=32 +TGT=-1 +WARM_UP=3 +LOOP_COUNT=20 +RUN_MODE=FPS +PRECISION=int8 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +source ${CONFIG_DIR} +ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} + +echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} +echo DATASETS_DIR : ${DATASETS_DIR} +echo RUN_DIR : ${RUN_DIR} +echo CONFIG_DIR : ${CONFIG_DIR} +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} + +step=0 +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx + +# Simplify Model +let step++ +echo; +echo [STEP ${step}] : Simplify Model +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + python3 ${RUN_DIR}/simplify_model.py \ + --origin_model $ORIGINE_MODEL \ + --output_model ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi + +# Quant Model +if [ $PRECISION == "int8" ];then + let step++ + echo; + echo [STEP ${step}] : Quant Model + if [[ -z 
${QUANT_EXIST_ONNX} ]];then + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx + fi + if [[ -f ${QUANT_EXIST_ONNX} ]];then + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed + else + python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --model_name ${MODEL_NAME} \ + --dataset_dir ${DATASETS_DIR} \ + --observer ${QUANT_OBSERVER} \ + --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ + --save_dir $CHECKPOINTS_DIR \ + --bsz ${QUANT_BATCHSIZE} \ + --step ${QUANT_STEP} \ + --seed ${QUANT_SEED} \ + --imgsz ${IMGSIZE} + SIM_MODEL=${QUANT_EXIST_ONNX} + echo " "Generate ${SIM_MODEL} + fi +fi + +# Change Batchsize +let step++ +echo; +echo [STEP ${step}] : Change Batchsize +FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx +if [ -f $FINAL_MODEL ];then + echo " "Change Batchsize Skip, $FINAL_MODEL has been existed +else + python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \ + --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL} + echo " "Generate ${FINAL_MODEL} +fi + +# Build Engine +let step++ +echo; +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ -f $ENGINE_FILE ];then + echo " "Build Engine Skip, $ENGINE_FILE has been existed +else + python3 ${RUN_DIR}/build_engine.py \ + --precision ${PRECISION} \ + --model ${FINAL_MODEL} \ + --engine ${ENGINE_FILE} + echo " "Generate Engine ${ENGINE_FILE} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --engine_file=${ENGINE_FILE} \ + --datasets_dir=${DATASETS_DIR} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --fps_target ${TGT} \ + --bsz ${BSZ}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/classification/vit/ixrt/README.md b/models/cv/classification/vit/ixrt/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..e0e7fb7e0a201359b59fb4c0b5c97e6ece45bfbf --- /dev/null +++ b/models/cv/classification/vit/ixrt/README.md @@ -0,0 +1,64 @@ +# Vision Transformer (ViT) (ixRT) + +## Model Description + +Vision Transformer (ViT) applies a pure transformer to images without any convolution. It divides an image into patches and processes them through transformer layers. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Download the [imagenet](https://www.image-net.org/download.php) to download the validation dataset. + +### Install Dependencies + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip3 install -r ../../ixrt_common/requirements.txt +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/vit_b_16_sim.onnx +``` + +## Model Inference + +```bash +export PROJ_DIR=./ +export DATASETS_DIR=/path/to/imagenet_val/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=../../ixrt_common/ +export CONFIG_DIR=../../ixrt_common/config/VIT_CONFIG +``` + +### FP16 + +```bash +# Test ACC +bash scripts/infer_vit_fp16_accuracy.sh +# Test FPS +bash scripts/infer_vit_fp16_performance.sh +``` + +## Model Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ----------- | --------- | --------- | ------- | -------- | -------- | +| ViT-B/16 | 32 | FP16 | 461.038 | 81.1 | 95.3 | \ No newline at end of file diff --git a/models/cv/classification/vit/ixrt/ci/prepare.sh b/models/cv/classification/vit/ixrt/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..c12ce37bf8fca6326d708692c5cbe58941999e97 --- /dev/null +++ 
b/models/cv/classification/vit/ixrt/ci/prepare.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install tqdm onnxsim opencv-python==4.6.0.66 + +mkdir -p checkpoints +cp /root/data/checkpoints/vit_b_16_sim.onnx checkpoints/ \ No newline at end of file diff --git a/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_accuracy.sh b/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..1d6c116c77a8f943c638a37b6cb232b220c906ca --- /dev/null +++ b/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_accuracy.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +set -euo pipefail + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + echo "fails" + EXIT_STATUS=1 + fi +} +# Run paraments +warm_up=10 +BSZ=32 +TGT=-1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +datasets_dir=${DATASETS_DIR} +onnx_model=${CHECKPOINTS_DIR}/vit_b_16_sim.onnx +engine_file=${CHECKPOINTS_DIR}/vit_b_16.engine + +echo "Build Fp16 Engine!" +python3 ${RUN_DIR}/build_engine.py \ + --precision float16 \ + --model ${onnx_model} \ + --engine ${engine_file}; check_status + +echo "Fp16 Inference Acc!" +python3 ${RUN_DIR}/inference.py \ + --test_mode ACC \ + --engine_file ${engine_file} \ + --datasets_dir ${datasets_dir} \ + --warm_up ${warm_up} \ + --bsz ${BSZ} \ + --acc_target ${TGT}; check_status \ No newline at end of file diff --git a/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_performance.sh b/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea1f93cbdc474cf6b0a59f8b779f292381ae646c --- /dev/null +++ b/models/cv/classification/vit/ixrt/scripts/infer_vit_fp16_performance.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +set -euo pipefail + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + echo "fails" + EXIT_STATUS=1 + fi +} +# Run paraments +warm_up=10 +loop_count=50 +BSZ=32 +TGT=-1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +datasets_dir=${DATASETS_DIR} +onnx_model=${CHECKPOINTS_DIR}/vit_b_16_sim.onnx +engine_file=${CHECKPOINTS_DIR}/vit_b_16.engine + +echo "Build Fp16 Engine!" +python3 ${RUN_DIR}/build_engine.py \ + --precision float16 \ + --model ${onnx_model} \ + --engine ${engine_file}; check_status + +echo "Fp16 Inference Fps!" +python3 ${RUN_DIR}/inference.py \ + --test_mode FPS \ + --engine_file ${engine_file} \ + --datasets_dir ${datasets_dir} \ + --warm_up ${warm_up} \ + --bsz ${BSZ} \ + --loop_count ${loop_count} \ + --fps_target ${TGT}; check_status \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/README.md b/models/cv/ocr/dbnet/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dccaaebfb0666bcda4e005d30a1faed16d337306 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/README.md @@ -0,0 +1,67 @@ +# DBNet (ixRT) + +## Model Description + +DBNet (Differentiable Binarization Network) is a scene text detection model that uses a differentiable binarization process for robust text detection. 
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained models: +- r50_en_dbnet: + +Dataset: ICDAR 2015 + +### Install Dependencies + +```bash +pip3 install shapely pyclipper opencv-python==4.6.0.66 tqdm +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/r50_en_dbnet.onnx +``` + +## Model Inference + +```bash +export DATASETS_DIR=/path/to/icdar2015/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=./ +``` + +### FP16 + +```bash +# Test ACC +bash scripts/infer_dbnet_fp16_accuracy.sh +# Test FPS +bash scripts/infer_dbnet_fp16_performance.sh +``` + +### INT8 + +```bash +# Test ACC +bash scripts/infer_dbnet_int8_accuracy.sh +# Test FPS +bash scripts/infer_dbnet_int8_performance.sh +``` + +## Model Results + +| Model | Backbone | BatchSize | Precision | FPS | Hmean | +| ----------- | -------- | --------- | --------- | ------- | ------- | +| DBNet | r50_en | 32 | FP16 | 143.85 | 0.803 | +| DBNet | r50_en | 32 | INT8 | 143.73 | 0.803 | \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/build_engine.py b/models/cv/ocr/dbnet/ixrt/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..96c4435dc6445cbe9c9daa299d6bb355c1a3d7ff --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/build_engine.py @@ -0,0 +1,49 @@ +import os +import cv2 +import argparse +import numpy as np + +import torch +import tensorrt + +from tensorrt import Dims + + +def main(config): + + input_shape = [args.batch_size,3, 736,1280] + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = 
builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + parser.parse_from_file(config.model) + + precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 + build_config.set_flag(precision) + if config.precision == "int8": + build_config.set_flag(tensorrt.BuilderFlag.FP16) + + input_tensor = network.get_input(0) + input_tensor.shape = Dims(input_shape) + + plan = builder.build_serialized_network(network, build_config) + engine_file_path = config.engine + with open(engine_file_path, "wb") as f: + f.write(plan) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str,default="wide_deep.onnx") + parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="float16", + help="The precision of datatype") + parser.add_argument("--engine", type=str, default="wide_deep.engine") + parser.add_argument("--batch_size", type=int, default=1) + + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/ci/prepare.sh b/models/cv/ocr/dbnet/ixrt/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2101a9f91f90986bd5828e6940e2ecded6886a1 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/ci/prepare.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +pip3 install shapely pyclipper opencv-python==4.6.0.66 tqdm + +mkdir -p checkpoints +cp /root/data/checkpoints/r50_en_dbnet.onnx checkpoints/ \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/dbnet_inference.py b/models/cv/ocr/dbnet/ixrt/dbnet_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0f3bda1d3c8daab5bc05e7d6c4a47da0a9e13c17 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/dbnet_inference.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import cv2 +import numpy as np +import argparse +import sys + + +from cuda import cuda, cudart +import torch +import tensorrt + +from util.common import eval_batch, create_engine_context, get_io_bindings + +from util import TextDetector + +def check_target(inference, target): + satisfied = False + if inference > target: + satisfied = True + return satisfied + +process_configs ={ + #pre process config + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': 1./255., + 'image_shape':(1280,736),#width height + + #post precess config + 'thresh':0.3, + 'box_thresh':0.5, + 'max_candidates':1000, + 'unclip_ratio':2, + 'use_dilation':False, + 'score_mode':'fast', + 'box_type':'quad', + 'batch_size':1 + +} + +def make_parser(): + parser = argparse.ArgumentParser("DBnet Eval") + parser.add_argument("--datasets_dir", type=str, default="data/icdar_2015_images", help="datasets dir ") + parser.add_argument("--engine_file", type=str, default="data/unit_test_r50_en_dbnet_bin/int8_r50_en_dbnet.engine", help="weights dir") + + 
parser.add_argument("-b", "--batch_size", type=int, default=1, help="batch size") + parser.add_argument("-d", "--device", default=1, type=int, help="device for val") + parser.add_argument("--img_height", default=736, type=int, help="test img height") + parser.add_argument("--img_width", default=1280, type=int, help="test img width") + parser.add_argument("--target_hmean", default=0.82, type=float, help="target Hmean") + parser.add_argument("--target_fps", default=30, type=float, help="target Hmean") + + parser.add_argument("--target", default="precision", type=str, help="precision or pref") + parser.add_argument("--warm_up", default=20, type=int , help="warm_up") + parser.add_argument("--loop_count", default=100, type=int , help="loop_count") + parser.add_argument("--seed", default=None, type=int, help="eval seed") + return parser + + + + +def eval(args): + + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(args.engine_file, logger) + + process_configs["image_dir"]=args.datasets_dir + process_configs["label_dir"] = args.datasets_dir + process_configs["image_shape"]= (args.img_height,args.img_width) + process_configs["batch_size"] = args.batch_size + db_det = TextDetector(engine,context,process_configs) + if args.target=="precision": + metrics = db_det.eval_icdar_2015(args.datasets_dir,args.batch_size) + print("="*40) + print("Precision:{0},Recall:{1},Hmean:{2}".format(round(metrics["precision"],3),round(metrics["recall"],3),round(metrics["hmean"],3))) + print("="*40) + print(f"Check hmean Test : {round(metrics['hmean'],3)} Target:{args.target_hmean} \ + State : {'Pass' if round(metrics['hmean'],3) >= args.target_hmean else 'Fail'}") + status_hmean = check_target(metrics["hmean"], args.target_hmean) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["hmean"] = round(metrics["hmean"], 3) + print(metricResult) + sys.exit(int(not (status_hmean))) + else: + fps = 
db_det.perf(args.warm_up,args.loop_count,args.batch_size) + print("="*40) + print("fps:{0}".format(round(fps,2))) + print("="*40) + print(f"Check fps Test : {round(fps,3)} Target:{args.target_fps} State : {'Pass' if fps >= args.target_fps else 'Fail'}") + status_fps = check_target(fps, args.target_fps) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["fps"] = round(fps, 3) + print(metricResult) + sys.exit(int(not (status_fps))) + +if __name__ == "__main__": + args = make_parser().parse_args() + eval(args) + + + + diff --git a/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_accuracy.sh b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..8ca2a706b393e95c335092592e7c3831ec2b34d0 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_accuracy.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +BSZ=16 +TGT=0.67 +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +DATASETS_DIR="/root/data/datasets/icdar_2015/icdar_2015_images" +CHECKPOINTS_DIR="./checkpoints" +RUN_DIR="${RUN_DIR:-.}" + +python3 ${RUN_DIR}/build_engine.py --model=${CHECKPOINTS_DIR}/r50_en_dbnet.onnx\ + --engine=${CHECKPOINTS_DIR}/float16_r50_en_dbnet.engine\ + --batch_size=${BSZ}\ + --precision="float16" + +python3 ${RUN_DIR}/dbnet_inference.py \ + --datasets_dir ${DATASETS_DIR} \ + --engine_file ${CHECKPOINTS_DIR}/float16_r50_en_dbnet.engine \ + --target "precision" \ + --batch_size ${BSZ} \ + --target_hmean ${TGT}; check_status + + exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_performance.sh b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..6dcde6c780ce5b513dbdf161dc536a81d7ba7ec8 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_fp16_performance.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License.
+ +BSZ=16 +TGT=-1 +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +DATASETS_DIR="/root/data/datasets/icdar_2015/icdar_2015_images" +CHECKPOINTS_DIR="./checkpoints" +RUN_DIR="${RUN_DIR:-.}" + +python3 ${RUN_DIR}/build_engine.py --model=${CHECKPOINTS_DIR}/r50_en_dbnet.onnx\ + --engine=${CHECKPOINTS_DIR}/float16_r50_en_dbnet.engine\ + --batch_size=${BSZ}\ + --precision="float16" + +python3 ${RUN_DIR}/dbnet_inference.py \ + --engine_file ${CHECKPOINTS_DIR}/float16_r50_en_dbnet.engine \ + --target "perf" \ + --batch_size ${BSZ} \ + --target_fps ${TGT};check_status + + exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_accuracy.sh b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..8ca2a706b393e95c335092592e7c3831ec2b34d0 --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_accuracy.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +BSZ=16 +TGT=0.67 +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + esac +done +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +DATASETS_DIR="/root/data/datasets/icdar_2015/icdar_2015_images" +CHECKPOINTS_DIR="./checkpoints" +RUN_DIR="${RUN_DIR:-.}" + +python3 ${RUN_DIR}/build_engine.py --model=${CHECKPOINTS_DIR}/r50_en_dbnet.onnx\ + --engine=${CHECKPOINTS_DIR}/int8_r50_en_dbnet.engine\ + --batch_size=${BSZ}\ + --precision="int8" + +python3 ${RUN_DIR}/dbnet_inference.py \ + --datasets_dir ${DATASETS_DIR} \ + --engine_file ${CHECKPOINTS_DIR}/int8_r50_en_dbnet.engine \ + --target "precision" \ + --batch_size ${BSZ} \ + --target_hmean ${TGT}; check_status + + exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_performance.sh b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..701f2496bedb3866d13ec29fa230d072a413519d --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/scripts/infer_dbnet_int8_performance.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
def eval_batch(batch_score, batch_label):
    """Count top-1 / top-5 hits for one batch of classification scores.

    Args:
        batch_score: numpy array of shape (N, num_classes) with per-class
            scores; num_classes must be >= 5 for the topk(5) call.
        batch_label: iterable of N ground-truth class indices.

    Returns:
        (top1, top5): number of samples whose label is the argmax,
        and number whose label is among the five highest-scoring classes.
    """
    # Fix: the original wrapped an existing tensor with torch.tensor(...),
    # forcing an extra copy (and a UserWarning); from_numpy + to() suffices.
    scores = torch.from_numpy(batch_score).to(torch.float32)
    _, indices = scores.topk(5)
    top1, top5 = 0, 0
    for idx, label in enumerate(batch_label):
        if label == indices[idx][0]:
            top1 += 1
        if label in indices[idx]:
            top5 += 1
    return top1, top5
def get_io_bindings(engine):
    """Allocate a device buffer for every binding of a deserialized engine.

    Args:
        engine: engine object exposing the legacy binding API
            (num_bindings / binding_is_input / get_binding_*).

    Returns:
        (inputs, outputs, allocations): binding descriptors (dicts with
        index, name, dtype, shape, allocation, nbytes) split by direction,
        plus the flat list of device pointers in binding order as required
        by context.execute_v2().
    """
    inputs = []
    outputs = []
    allocations = []

    for i in range(engine.num_bindings):
        name = engine.get_binding_name(i)
        dtype = engine.get_binding_dtype(i)
        shape = engine.get_binding_shape(i)
        # Byte size of the binding: itemsize * product of all dims.
        # (Removed the original's dead `is_input` flag and the
        # `batch_size = shape[0]` assignment that was never read.)
        size = np.dtype(tensorrt.nptype(dtype)).itemsize
        for s in shape:
            size *= s
        err, allocation = cudart.cudaMalloc(size)
        assert err == cudart.cudaError_t.cudaSuccess
        binding = {
            "index": i,
            "name": name,
            "dtype": np.dtype(tensorrt.nptype(dtype)),
            "shape": list(shape),
            "allocation": allocation,
            "nbytes": size,
        }
        print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
        allocations.append(allocation)
        if engine.binding_is_input(i):
            inputs.append(binding)
        else:
            outputs.append(binding)
    return inputs, outputs, allocations
    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        '''
        Extract polygonal text regions from a binarized map.

        pred: probability map, used only for scoring candidate regions.
        _bitmap: single map with shape (1, H, W),
            whose values are binarized as {0, 1}

        Returns (boxes, scores): polygon vertex lists rescaled to
        (dest_width, dest_height), and their mean-probability scores.
        '''

        bitmap = _bitmap
        height, width = bitmap.shape

        boxes = []
        scores = []

        # Outline every connected region of the binary map.
        contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
                                       cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours[:self.max_candidates]:
            # Simplify the contour; tolerance is 0.2% of its perimeter.
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                # Fewer than 4 vertices cannot be a text polygon.
                continue

            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            # NOTE: always true here (points has >= 4 vertices after the
            # guard above); the else branch is unreachable, kept as-is.
            if points.shape[0] > 2:
                box = self.unclip(points, self.unclip_ratio)
                if len(box) > 1:
                    # Unclipping split the region into several paths; drop it.
                    continue
            else:
                continue
            box = box.reshape(-1, 2)

            # Discard boxes whose shorter side is too small to be text.
            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            box = np.array(box)
            # Rescale from bitmap resolution to the destination image size.
            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box.tolist())
            scores.append(score)
        return boxes, scores
_bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = box[:, 0]-pad_w + box[:, 1] = box[:, 1]-pad_h + + box[:, 0] = np.clip( + np.round(box[:, 0] / scale), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / scale), 0, dest_height) + + # box[:, 0] = np.clip( + # np.round(box[:, 0] / width * dest_width), 0, dest_width) + # box[:, 1] = np.clip( + # np.round(box[:, 1] / height * dest_height), 0, dest_height) + + #box[:, 0] = box[:, 0]+(int)(pad_w*dest_width/width) + #box[:, 1] = box[:, 1]+(int)(pad_h*dest_height/height ) + + + #box[:, 0] = np.clip(np.round(box[:, 0]) , 0, 1280) + #box[:, 1] = np.clip(np.round(box[:, 1]) , 0, 736) + + boxes.append(box.astype("int32")) + scores.append(score) + return np.array(boxes, dtype="int32"), scores + + def unclip(self, box, unclip_ratio): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = 
sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + # if isinstance(pred, paddle.Tensor): + # pred = pred.numpy() + pred = pred[:, 0, :, :] + 
def img2label_paths(img_paths):
    """Map ICDAR-2015 image paths to their ground-truth label paths.

    ``.../icdar_2015_images/<name>.<ext>`` becomes
    ``.../icdar_2015_labels/gt_<name>.txt``. Only the LAST occurrence of
    the images directory is rewritten; the file extension is replaced
    with ``.txt`` in either case.
    """
    img_dir = f'{os.sep}icdar_2015_images{os.sep}'
    lbl_dir = f'{os.sep}icdar_2015_labels{os.sep}gt_'
    label_paths = []
    for path in img_paths:
        swapped = lbl_dir.join(path.rsplit(img_dir, 1))
        label_paths.append(swapped.rsplit('.', 1)[0] + '.txt')
    return label_paths
def letterbox(im, new_shape=(736, 1280), color=(114, 114, 114), auto=False, scaleup=True, stride=32):
    """Resize and pad `im` to `new_shape` while keeping its aspect ratio.

    Args:
        im: HxWxC image array.
        new_shape: target (height, width), or one int for a square target.
        color: per-channel constant padding value.
        auto: if True, pad only to the next multiple of `stride`
            (minimum rectangle) instead of the full target shape.
        scaleup: if False, never enlarge the image (only shrink).
        stride: stride used by the `auto` minimum-rectangle padding.

    Returns:
        (padded_image, r, dw, dh): the letterboxed image, the scale ratio
        applied, and the per-side horizontal/vertical padding (possibly
        fractional; the actual border uses the rounded values below).
    """
    # Resize and pad image while meeting stride-multiple constraints

    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding. NOTE: new_unpad is (height, width); dw/dh below
    # index it accordingly (dw from widths, dh from heights).
    new_unpad = int(round(shape[0] * r)), int(round(shape[1] * r))
    dw, dh = new_shape[1] - new_unpad[1], new_shape[0] - new_unpad[0]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape != new_unpad:  # resize
        # cv2.resize expects (width, height), hence the [::-1] flip.
        im = cv2.resize(im, new_unpad[::-1], interpolation=cv2.INTER_LINEAR)

    # The -0.1/+0.1 rounding splits an odd padding pixel between the sides.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im1 = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im1, r, dw, dh
    def batch_forward(self, inputs, outputs, allocations, batch_data, shape_list):
        """Run one batch through the engine and post-process the output maps.

        inputs/outputs/allocations: binding descriptors and device pointers
        produced by get_io_bindings(). batch_data is NHWC host data;
        shape_list carries one (h, w, pad_h, pad_w, scale) row per real
        image for DBPostProcess. Returns the post-processed box results.
        """
        output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
        # NOTE(review): `input` shadows the builtin of the same name.
        input = np.zeros(inputs[0]["shape"], inputs[0]["dtype"])
        real_batch = batch_data.shape[0]
        # NHWC -> NCHW, cast to the engine's input dtype, make contiguous
        # so the raw H2D copy below sees a flat buffer.
        batch_data = np.transpose(batch_data, [0, 3, 1, 2])
        batch_data = batch_data.astype(inputs[0]["dtype"])
        batch_data = np.ascontiguousarray(batch_data)
        input[:real_batch, :, :, :] = batch_data

        # NOTE(review): only batch_data.nbytes (the real batch) is uploaded,
        # so the zero-padded `input` buffer is never copied; on a partial
        # last batch the tail of the device buffer holds stale data. This
        # looks harmless because DBPostProcess.__call__ iterates only
        # shape_list rows — confirm before reusing this method elsewhere.
        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
        assert (err == cuda.CUresult.CUDA_SUCCESS)
        self.context.execute_v2(allocations)
        err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
        assert (err == cuda.CUresult.CUDA_SUCCESS)
        outs_dict = {"maps": output}
        post_result = self.postprocess(outs_dict, shape_list)
        return post_result
eval_icdar_2015(self,img_dir,batch_size): + dataloader = self.get_dataloader(img_dir,batch_size) + label_files =[] + evaluator = DetectionIoUEvaluator() + + inputs, outputs, allocations = get_io_bindings(self.engine) + gts =[] + preds=[] + all_boxes= [] + for i, data in enumerate(tqdm(dataloader,disable=False)): + batch_data, shape_list,batch_label = data + label_files.extend(batch_label) + post_result= self.batch_forward(inputs,outputs,allocations,batch_data,shape_list) + all_boxes.extend(post_result) + print("============start evel=========================") + for i, per_image_boxes in enumerate(all_boxes): + one_pred=[] + dt_boxes = per_image_boxes["points"] + for bbox in dt_boxes: + one_pred_res={} + one_pred_res["points"]=[tuple(x) for x in bbox.tolist()] + one_pred_res["text"]="text" + one_pred_res["ignore"] =False + one_pred.append(one_pred_res) + preds.append(one_pred) + label_file= label_files[i] + one_gt=[] + with open(label_file) as f: + lines = f.readlines() + for line in lines: + one_gt_res={} + line_label=line.strip().split(",")[:9] + x1,y1,x2,y2,x3,y3,x4,y4,label =line_label + gt_bbox= [(int(x1), int(y1)), (int(x2), int(y2)), (int(x3), int(y3)), (int(x4), int(y4))] + one_gt_res["points"]=gt_bbox + one_gt_res["text"]=label + if label=="###": + one_gt_res["ignore"] =True + else: + one_gt_res["ignore"] =False + one_gt.append(one_gt_res) + gts.append(one_gt) + + + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + return metrics + + def perf(self,warm_up,loop_count,batch_size): + inputs, outputs, allocations = get_io_bindings(self.engine) + if warm_up > 0: + print("\nWarm Start.") + for i in range(warm_up): + self.context.execute_v2(allocations) + print("Warm Done.") + torch.cuda.synchronize() + start_time = time.time() + for i in range(loop_count): + self.context.execute_v2(allocations) + torch.cuda.synchronize() + end_time = time.time() + forward_time = 
end_time - start_time + fps = loop_count * batch_size / forward_time + fps = round(fps,2) + return fps + + + def pre_process(self,img_file): + org_img = cv2.imread(img_file) + image = org_img.copy() + #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + letter_img, r, dw, dh= letterbox(image,self.configs["image_shape"]) + in_img = letter_img.copy() + #image = cv2.resize(image, (1280, 736)) + in_img = in_img.astype(np.float32) + in_img /= 255 + in_img =(in_img-0.456)/0.224 + return letter_img,in_img,org_img, r, dw, dh + + + + + + diff --git a/models/cv/ocr/dbnet/ixrt/util/eval_det_iou.py b/models/cv/ocr/dbnet/ixrt/util/eval_det_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..3b1a1702e66496f15ec8ff967ac8fe90ab7aec4f --- /dev/null +++ b/models/cv/ocr/dbnet/ixrt/util/eval_det_iou.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon +import glob +import os +""" +reference from : +https://github.com/MhLiao/DB/blob/3c32b808d4412680310d3d28eeb6a2d5bf1566c5/concern/icdar2015_eval/detection/iou.py#L8 +""" + + + +class DetectionIoUEvaluator(object): + def __init__(self, iou_constraint=0.5, area_precision_constraint=0.5): + self.iou_constraint = iou_constraint + self.area_precision_constraint = area_precision_constraint + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def compute_ap(confList, matchList, numGtCare): + correct = 0 + AP = 0 + if len(confList) > 0: + confList = np.array(confList) + matchList = np.array(matchList) + sorted_ind = np.argsort(-confList) + confList = confList[sorted_ind] + matchList = matchList[sorted_ind] + for n in range(len(confList)): + match = matchList[n] + if match: + 
correct += 1 + AP += float(correct) / (n + 1) + + if numGtCare > 0: + AP /= numGtCare + + return AP + + perSampleMetrics = {} + + matchedSum = 0 + + Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax') + + numGlobalCareGt = 0 + numGlobalCareDet = 0 + + arrGlobalConfidences = [] + arrGlobalMatches = [] + + recall = 0 + precision = 0 + hmean = 0 + + detMatched = 0 + + iouMat = np.empty([1, 1]) + + gtPols = [] + detPols = [] + + gtPolPoints = [] + detPolPoints = [] + + # Array of Ground Truth Polygons' keys marked as don't Care + gtDontCarePolsNum = [] + # Array of Detected Polygons' matched with a don't Care GT + detDontCarePolsNum = [] + + pairs = [] + detMatchedNums = [] + + arrSampleConfidences = [] + arrSampleMatch = [] + + evaluationLog = "" + + for n in range(len(gt)): + points = gt[n]['points'] + dontCare = gt[n]['ignore'] + if not Polygon(points).is_valid: + continue + + gtPol = points + gtPols.append(gtPol) + gtPolPoints.append(points) + if dontCare: + gtDontCarePolsNum.append(len(gtPols) - 1) + + evaluationLog += "GT polygons: " + str(len(gtPols)) + ( + " (" + str(len(gtDontCarePolsNum)) + " don't care)\n" + if len(gtDontCarePolsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + if not Polygon(points).is_valid: + continue + + detPol = points + detPols.append(detPol) + detPolPoints.append(points) + if len(gtDontCarePolsNum) > 0: + for dontCarePol in gtDontCarePolsNum: + dontCarePol = gtPols[dontCarePol] + intersected_area = get_intersection(dontCarePol, detPol) + pdDimensions = Polygon(detPol).area + precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions + if (precision > self.area_precision_constraint): + detDontCarePolsNum.append(len(detPols) - 1) + break + + evaluationLog += "DET polygons: " + str(len(detPols)) + ( + " (" + str(len(detDontCarePolsNum)) + " don't care)\n" + if len(detDontCarePolsNum) > 0 else "\n") + + if len(gtPols) > 0 and len(detPols) > 0: + # Calculate IoU and precision matrixs + 
outputShape = [len(gtPols), len(detPols)] + iouMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtPols), np.int8) + detRectMat = np.zeros(len(detPols), np.int8) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = gtPols[gtNum] + pD = detPols[detNum] + iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG) + + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum: + if iouMat[gtNum, detNum] > self.iou_constraint: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + detMatched += 1 + pairs.append({'gt': gtNum, 'det': detNum}) + detMatchedNums.append(detNum) + evaluationLog += "Match GT #" + \ + str(gtNum) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtPols) - len(gtDontCarePolsNum)) + numDetCare = (len(detPols) - len(detDontCarePolsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if numDetCare > 0 else float(1) + else: + recall = float(detMatched) / numGtCare + precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare + + hmean = 0 if (precision + recall) == 0 else 2.0 * \ + precision * recall / (precision + recall) + + matchedSum += detMatched + numGlobalCareGt += numGtCare + numGlobalCareDet += numDetCare + + perSampleMetrics = { + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'detMatched': detMatched, + } + return perSampleMetrics + + def combine_results(self, results): + numGlobalCareGt = 0 + numGlobalCareDet = 0 + matchedSum = 0 + for result in results: + numGlobalCareGt += result['gtCare'] + numGlobalCareDet += result['detCare'] + matchedSum += result['detMatched'] + + methodRecall = 0 if numGlobalCareGt == 0 else float( + matchedSum) / numGlobalCareGt + methodPrecision = 0 if numGlobalCareDet == 0 else float( + matchedSum) / numGlobalCareDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \ + methodRecall * 
methodPrecision / ( + methodRecall + methodPrecision) + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + +def read_label(label_file): + #gts = [] + one_gt = [] + with open(label_file) as f: + lines = f.readlines() + + + for line in lines: + one_res={} + cord,label = line.strip().split(",")[:8],line.strip().split(",")[-1] + cord=[int(x) for x in cord] + one_res["points"]=[(cord[0],cord[1]),(cord[2],cord[3]),(cord[4],cord[5]),(cord[6],cord[7])] + one_res["text"]=label + if label=="###": + one_res["ignore"] =True + else: + one_res["ignore"] =False + one_gt.append(one_res) + #gts.append(one_gt) + return one_gt + + + + +if __name__ == '__main__': + evaluator = DetectionIoUEvaluator() + + gt_files = glob.glob("/home/fangjian.hu/workspace/ixrt/test_data/MSRA/test_labels_icdar/*") + pred_path = "/home/fangjian.hu/workspace/ixrt/test_data/MSRA_pred_dt/" + + gts =[] + preds=[] + + for gt_file in gt_files: + label_name = os.path.split(gt_file)[-1] + pred_file = os.path.join(pred_path,label_name) + one_gt= read_label(gt_file) + one_pred= read_label(pred_file) + gts.append(one_gt) + preds.append(one_pred) + + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) + + + + + + + + + + + + + + # for item in data["label"]: + # print(item) + # if item["transcription"]!="###": + # print(item) + + + + + + + + + + # gts = [[{ + # 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + # 'text': 1234, + # 'ignore': False, + # }, { + # 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + # 'text': 5678, + # 'ignore': False, + # }]] + # preds = [[{ + # 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + # 'text': 123, + # 'ignore': False, + # }]] + # results = [] + # for gt, pred in zip(gts, preds): + # results.append(evaluator.evaluate_image(gt, pred)) + # metrics = evaluator.combine_results(results) + # print(metrics) 
diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/README.md b/models/cv/semantic_segmentation/ddrnet/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ce594e2a8341dfb5823e34af80700550a54056f --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/README.md @@ -0,0 +1,66 @@ +# DDRNet (ixRT) + +## Model Description + +DDRNet (Dual Resolution Network) is a real-time semantic segmentation network that learns rich representations through bilateral detail preservation and deep aggregation for high-resolution image understanding. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Dataset: to download the dataset. + +### Install Dependencies + +```bash +pip3 install xtcocotools tqdm munkres onnxsim opencv-python==4.6.0.66 +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/ddrnet23.onnx +``` + +## Model Inference + +```bash +export DATASETS_DIR=/Path/to/cityscapes/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=./ +``` + +### FP16 + +```bash +# Test ACC (mIoU) +bash scripts/infer_ddrnet_fp16_accuracy.sh +# Test FPS +bash scripts/infer_ddrnet_fp16_performance.sh +``` + +### INT8 + +```bash +# Test ACC (mIoU) +bash scripts/infer_ddrnet_int8_accuracy.sh +# Test FPS +bash scripts/infer_ddrnet_int8_performance.sh +``` + +## Model Results + +| Model | BatchSize | Precision | FPS | mIoU(%) | mAcc(%) | +| ------ | --------- | --------- | ------- | ------- | ------- | +| DDRNet | 4 | FP16 | 98.278 | 12.8 | 25.8 | +| DDRNet | 4 | INT8 | 123.94 | 12.9 | 25.6 | \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/build_engine.py 
def load_ixrt_plugin(logger=None, namespace="", dynamic_path=""):
    """Load the ixRT plugin shared library and register its TensorRT ops.

    Args:
        logger: TensorRT logger; a fresh INFO-level logger is created when
            omitted. (Fix: the original instantiated tensorrt.Logger in the
            default argument, which runs at function-definition time — the
            classic mutable/eager default-argument pitfall.)
        namespace: plugin namespace passed to init_libnvinfer_plugins.
        dynamic_path: explicit path to libixrt_plugin.so; defaults to the
            copy shipped inside the tensorrt package.

    Raises:
        FileNotFoundError: if the plugin library cannot be found.
    """
    if logger is None:
        logger = tensorrt.Logger(tensorrt.Logger.INFO)
    if not dynamic_path:
        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
    if not exists(dynamic_path):
        raise FileNotFoundError(
            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
    # Loading the .so registers the plugin creators with TensorRT.
    ctypes.CDLL(dynamic_path)
    tensorrt.init_libnvinfer_plugins(logger, namespace)
    print(f"Loaded plugin from {dynamic_path}")
def parse_args(argv=None):
    """Parse build_engine CLI options.

    Args:
        argv: optional explicit argument list for testing/embedding;
            defaults to sys.argv[1:] (backward compatible with the
            original zero-argument call).

    Returns:
        argparse.Namespace with model, bsz, precision, imgsz_h, imgsz_w,
        engine and device fields.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="ddrnet23.onnx")
    parser.add_argument("--bsz", type=int, default=4, help="batch size")
    parser.add_argument("--precision", type=str, choices=["float16", "int8"],
                        default="int8", help="The precision of datatype")
    parser.add_argument("--imgsz_h", type=int, default=1024, help="inference size h")
    parser.add_argument("--imgsz_w", type=int, default=2048, help="inference size w")
    # engine args
    parser.add_argument("--engine", type=str, default=None)
    # device
    parser.add_argument(
        "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4"
    )

    return parser.parse_args(argv)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +pip3 install xtcocotools tqdm munkres onnxsim opencv-python==4.6.0.66 + +mkdir -p checkpoints +cp /root/data/checkpoints/ddrnet23.onnx checkpoints/ \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/deploy.py b/models/cv/semantic_segmentation/ddrnet/ixrt/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..6c68506dcc96fd3ef9af0f0417d8909a6ea4d72e --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/deploy.py @@ -0,0 +1,127 @@ +import os +import cv2 +import argparse +import numpy as np +import torch + +from utils import input_transform + +from tensorrt import IxRT +from ixrt.common import RuntimeConfig, RuntimeContext +from tensorrt.deploy.api import * + + +def create_runtime_from_model(args): + model = args.model + quant_file = args.quant_file + precision = args.precision + + config = RuntimeConfig() + config.input_shapes = [("inputx", [args.bsz, 3, args.imgsz_h, args.imgsz_w])] + config.device_idx = args.device + if precision == "int8": + assert os.path.isfile(quant_file), "Quant file must provided for int8 inferencing" + + config.runtime_context = RuntimeContext( + precision, + "nhwc", + use_gpu=True, + pipeline_sync=True, + input_types={"inputx": "float32"}, + output_types={"outputy": "float32"} + ) + runtime = IxRT.from_onnx(model, quant_file, config) + runtime.Init(runtime.config) + return runtime + + +def 
create_runtime_from_engine(engine):
+    runtime = IxRT()
+    runtime.LoadEngine(engine)
+    return runtime
+
+
+def pre_process(img_file):
+    assert os.path.isfile(img_file), f"The input file {img_file} must exist!"
+    img = cv2.imread(img_file, cv2.IMREAD_COLOR)
+    img = input_transform(
+        img,
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]
+    )
+    return img
+
+
+def main(args):
+    print(args)
+    img_file = args.img_file
+    if args.engine is not None:
+        runtime = create_runtime_from_engine(args.engine)
+    else:
+        runtime = create_runtime_from_model(args)
+
+    input_map = runtime.GetInputShape()
+    output_map = runtime.GetOutputShape()
+    print(f"input map is: {input_map}")
+    print(f"output map is: {output_map}")
+
+    input_io_buffers = []
+    output_io_buffers = []
+    for name, shape in input_map.items():
+        # 1. apply memory buffer for input of the shape, based on shape and padding
+        _shape, _padding = shape.dims, shape.padding
+        _shape = [i + j for i, j in zip(_shape, _padding)]
+        _shape = [_shape[0], *_shape[2:4], _shape[1]]
+        # currently we only support float32 as I/O
+        buffer = np.zeros(_shape, dtype=np.float32)
+        # 2. load image to the buffer, TODO batch load
+        img = pre_process(img_file)
+        print("image shape is:", img.shape)
+
+        buffer[0, :, :, :3] = img
+        print(f"Allocated input buffer:{_shape}")
+
+        # 3. put the buffer to a list
+        input_io_buffers.append([name, buffer, shape])
+
+    for name, shape in output_map.items():
+        # 1. apply memory buffer for output of the shape
+        # output_buffer = np.zeros(shape.dims, dtype=np.float32)
+        bs, c, h, w = shape.dims
+        dims = [bs, h, w, c]
+
+        output_buffer = np.zeros(dims, dtype=np.float32)
+        # 2. 
put the buffer to a list + output_io_buffers.append([name, output_buffer, shape]) + + runtime.LoadInput(input_io_buffers) + runtime.Execute() + runtime.FetchOutput(output_io_buffers) + + print(f"Test Achieved!") + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="ddrnet23.onnx") + parser.add_argument("--quant_file", type=str, default=None, help="the json of quantization") + parser.add_argument("--bsz", type=int, default=4, help="batch size") + parser.add_argument("--precision", type=str, choices=["float16", "int8"], default="int8", help="The precision of datatype") + parser.add_argument("--warm_up", type=int, default=5, help="warm_up count") + parser.add_argument("--imgsz_h", type=int, default=1024, help="inference size h") + parser.add_argument("--imgsz_w", type=int, default=2048, help="inference size w") + # engine args + parser.add_argument("--engine", type=str, default=None) + parser.add_argument("--img_file", type=str, default=None) + # device + parser.add_argument( + "--device", type=int, default=0, help="cuda device, i.e. 
0 or 0,1,2,3,4" + ) + + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/inference.py b/models/cv/semantic_segmentation/ddrnet/ixrt/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..bb178a2bd34dbdb7a2d19c6f56e1e6afb71c8efb --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/inference.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import glob +import json +import os +import sys +import time +import random +import ctypes +import numpy as np +from os.path import join, dirname, exists + +from tqdm import tqdm + +from utils import Dataset, get_confusion_matrix +import tensorrt +import cuda.cuda as cuda +import cuda.cudart as cudart + +def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path) + tensorrt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") + +load_ixrt_plugin() + +def create_engine_context(config): + engine_path = config.engine_file + datatype = tensorrt.DataType.FLOAT + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + with open(engine_path, "rb") as f, tensorrt.Runtime(logger) as runtime: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = 
False
+        if engine.binding_is_input(i):
+            is_input = True
+        name = engine.get_binding_name(i)
+        dtype = engine.get_binding_dtype(i)
+        shape = context.get_binding_shape(i)
+
+        if is_input:
+            batch_size = shape[0]
+        size = np.dtype(tensorrt.nptype(dtype)).itemsize
+        for s in shape:
+            size *= s
+        err, allocation = cudart.cudaMalloc(size)
+        assert err == cudart.cudaError_t.cudaSuccess
+        binding = {
+            "index": i,
+            "name": name,
+            "dtype": np.dtype(tensorrt.nptype(dtype)),
+            "shape": list(shape),
+            "allocation": allocation,
+            "nbytes": size,
+        }
+        allocations.append(allocation)
+        if engine.binding_is_input(i):
+            inputs.append(binding)
+        else:
+            outputs.append(binding)
+    return inputs, outputs, allocations
+
+def check_target(inference, target):
+    satisfied = False
+    if inference > target:
+        satisfied = True
+    return satisfied
+
+
+def test_mIoU_mAcc(dataset, config):
+
+    confusion_matrix = np.zeros((config.num_classes, config.num_classes))
+
+    host_mem = tensorrt.IHostMemory
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+    engine, context = create_engine_context(config)
+    inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+    run_times = []
+
+    for i, element in tqdm(enumerate(dataset), desc="Testing mIoU and mAcc"):
+        start_time = time.time()
+        img, label, pad_size, name = element
+        img = np.ascontiguousarray(img.transpose((0,3,1,2)))
+        b, c, h, w = img.shape
+
+        output = np.zeros([b, 32, h, w], outputs[0]["dtype"])
+        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], img, img.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        context.execute_v2(allocations)
+        err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+
+        pred = output[:, :config.num_classes, :, :]
+        # flip test
+        if config.flip:
+            flip_img = np.ascontiguousarray(img[:, :, :, ::-1])
+
+            err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], flip_img, flip_img.nbytes)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            context.execute_v2(allocations)
+            
err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + flip_pred = output[:, :config.num_classes, :, :] + + pred += flip_pred + out = np.exp(pred * 0.5) + else: + out = np.exp(pred) + + out = out.transpose((0,2,3,1)) + + for j in range(b): + confusion_matrix += get_confusion_matrix( + label[j:j+1], + out[j:j+1], + pad_size[j], + config.num_classes, + config.ignore_label + ) + + end_time = time.time() + run_times.append(end_time - start_time) + + num_imgs = i * config.bsz + if num_imgs % 100 == 0: + print(f"[INFO] processing: {num_imgs} images") + pos = confusion_matrix.sum(1) + res = confusion_matrix.sum(0) + tp = np.diag(confusion_matrix) + IoU_array = (tp / np.maximum(1.0, pos + res - tp)) + mean_IoU = IoU_array.mean() + print("[INFO] mIoU: %.4f" % (mean_IoU)) + + pos = confusion_matrix.sum(1) + res = confusion_matrix.sum(0) + tp = np.diag(confusion_matrix) + pixel_acc = tp.sum() / pos.sum() + mean_acc = (tp / np.maximum(1.0, pos)).mean() + IoU_array = (tp / np.maximum(1.0, pos + res - tp)) + mean_IoU = IoU_array.mean() + + # Calculate FPS + run_times.remove(max(run_times)) + run_times.remove(min(run_times)) + avg_time = sum(run_times) / len(run_times) + fps = 1. 
/ avg_time + print(f"Executing Done, Time: {avg_time}, FPS: {fps}, mIoU: {mean_IoU}, mAcc: {mean_acc}") + print(f"Class IoU:") + print(f"{IoU_array}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["mIoU"] = round(mean_IoU, 3) + metricResult["metricResult"]["mAcc"] = round(mean_acc, 3) + print(metricResult) + return mean_IoU, mean_acc + + +def test_fps(config, loop_count, dataset): + + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + + engine, context = create_engine_context(config) + inputs, outputs, allocations = setup_io_bindings(engine, context) + + run_times = [] + + if config.warm_up > 0: + print("\nWarm Start.") + for i in range(config.warm_up): + context.execute_v2(allocations) + print("Warm Done.") + + batch_data0 = dataset[0] + for i in range(loop_count): + img, label, pad_size, name = batch_data0 + b, h, w, c = img.shape + output = np.zeros([b, 32, h, w], outputs[0]["dtype"]) + img = np.ascontiguousarray(img.transpose((0,3,1,2))) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], img, img.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + start_time = time.time() + context.execute_v2(allocations) + end_time = time.time() + err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + temp_time = end_time - start_time + fps = b / temp_time + print(f"time: {temp_time}, fps: {fps}") + run_times.append(temp_time) + + # Calculate FPS + run_times.remove(max(run_times)) + run_times.remove(min(run_times)) + + avg_time = sum(run_times) / len(run_times) + fps = b / avg_time + print(f"Executing {loop_count} done, Time: {avg_time}, FPS: {fps}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["FPS"] = round(fps, 3) + print(metricResult) + return fps + + +def main(config): + + num_samples = 1 + bsz = config.bsz + if config.loop_count > 0: + num_samples = bsz * config.loop_count + num_batch = (num_samples + bsz - 1) // bsz 
+ + dataset = Dataset( + root=config.dataset_dir, + list_path=config.list_path, + batch_size=config.bsz, + ignore_label=255 + ) + + if config.test_mode == "MIOU": + mIoU, mAcc = test_mIoU_mAcc(dataset, config) + status_mIoU_mAcc = check_target(mIoU, config.target_mIoU) and check_target(mAcc, config.target_mAcc) + sys.exit(int(not (status_mIoU_mAcc))) + + elif config.test_mode == "FPS": + # Warm up + fps = test_fps(config, config.loop_count, dataset) + status_fps = check_target(fps, config.target_fps) + sys.exit(int(not (status_fps))) + + +def parse_config(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_type", + type=str, + default="DDRNET", + help="The semantic segmentation(ddrnet)", + ) + parser.add_argument("--engine_file", type=str, help="engine file path") + parser.add_argument("--test_mode", type=str, default="MIOU", help="FPS MIOU") + parser.add_argument( + "--dataset_dir", + type=str, + default="/root/data/datasets", + help="The directory of dataset(cityscapes)", + ) + parser.add_argument( + "--list_path", + type=str, + default="/root/data/datasets/cityscapes/val.lst", + help="The val name list of dataset(cityscapes)", + ) + parser.add_argument("--warm_up", type=int, default=5, help="warm_up count") + parser.add_argument("--flip", action='store_true', help="Flip test") + parser.add_argument("--bsz", type=int, default=4, help="batch size") + parser.add_argument("--num_classes", type=int, default=19, help="the category of dataset") + parser.add_argument("--ignore_label", type=int, default=255, help="the category of not used in calculate confusion matrix") + parser.add_argument("--imgsz_h", type=int, default=1024, help="inference size h") + parser.add_argument("--imgsz_w", type=int, default=2048, help="inference size w") + parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs") + parser.add_argument("--target_fps", type=float, default=-1.0) + parser.add_argument("--target_mIoU", type=float, default=-1.0) + 
parser.add_argument("--target_mAcc", type=float, default=-1.0) + parser.add_argument("--loop_count", type=int, default=12) + parser.add_argument( + "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4" + ) + + config = parser.parse_args() + return config + + +if __name__ == "__main__": + config = parse_config() + main(config) diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/quant.py b/models/cv/semantic_segmentation/ddrnet/ixrt/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..b5347f66fe861ee176434f89d56b31c5db30de7c --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/quant.py @@ -0,0 +1,70 @@ +import os +import cv2 +import random +import argparse +import numpy as np +from random import shuffle +from utils import input_transform +from tensorrt.deploy import static_quantize + +import torch +import torchvision.datasets +from torch.utils.data import DataLoader + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="ddrnet23.onnx") + parser.add_argument("--dataset_dir", type=str, default="/root/data/datasets/cityscapes") + parser.add_argument("--list_path", type=str, default="/root/data/datasets/cityscapes/val.lst", help="The path of val list.") + parser.add_argument("--save_dir", type=str, help="quant file", default=None) + args = parser.parse_args() + return args + + +def getdataloader(datadir, list_path, step=32, batch_size=4): + num = step * batch_size + + img_list = [line.strip().split()[0] for line in open(list_path)] + val_list = [os.path.join(datadir, x) for x in img_list] + random.shuffle(val_list) + pic_list = val_list[:num] + + dataloader = [] + # imgsz = (1024, 2048) + for file_path in pic_list: + img = cv2.imread(file_path, cv2.IMREAD_COLOR) + img = input_transform( + img, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225] + ) + img = img.transpose((2, 0, 1)) + dataloader.append(img) + + calibration_dataset = dataloader + 
calibration_dataloader = DataLoader( + calibration_dataset, + shuffle=True, + batch_size=batch_size, + drop_last=True + ) + return calibration_dataloader + + +args = parse_args() +model_name = os.path.basename(args.model) +model_name = model_name.rsplit(".", maxsplit=1)[0] + +out_dir = os.path.dirname(args.model) +dataloader = getdataloader(args.dataset_dir, args.list_path) + +static_quantize(args.model, + calibration_dataloader=dataloader, + save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"), + save_quant_params_path=os.path.join(out_dir, f"quantized_ddrnet23.json"), + observer="percentile", + analyze=True, + quant_format="qdq", + data_preprocess=lambda x: x.to("cuda"), + ) diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_accuracy.sh b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..e5d96ce25ad3ce55973441f78f755756df621bb6 --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_accuracy.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +set -e + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +MODEL_NAME="ddrnet" +BSZ=4 +PRECISION="float16" +DEVICE=0 +FORCE_BUILD=0 +TGT_0=-1 +TGT_1=-1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + -p | --precision) PRECISION=${arguments[index]};; + -d | --device) DEVICE=${arguments[index]};; + --bs) BSZ=${arguments[index]};; + --tgt_iou) TGT_0=${arguments[index]};; + --tgt_acc) TGT_1=${arguments[index]};; + -f | --force) FORCE_BUILD=1;; + esac +done + +CHECKPOINTS_DIR="./checkpoints" +DATASET_DIR="/root/data/datasets" +LIST_PATH="/root/data/datasets/cityscapes/val.lst" +RUN_DIR="${RUN_DIR:-.}" +ORIGINE_MODEL="${CHECKPOINTS_DIR}/ddrnet23.onnx" + +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} +echo; + +function run_cmd() +{ + echo "[CMD]: $@" + eval $@ +} + +step=1 + +# Simplify Model +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + run_cmd python3 ${RUN_DIR}/sim_onnx_model.py \ + --raw_model_path ${ORIGINE_MODEL} \ + --sim_model_path ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi +let step++ +echo; + +# Build Engine +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ ${FORCE_BUILD} -eq 1 ] && [ -e ${ENGINE_FILE} ];then + rm ${ENGINE_FILE} +fi +echo "Building engine(${PRECISION})" +if [ -e ${ENGINE_FILE} ];then + echo " "Build Engine Skip, ${ENGINE_FILE} has been existed +else + run_cmd python3 ${RUN_DIR}/build_engine.py \ + --model ${SIM_MODEL} \ + --bsz ${BSZ} \ + --precision ${PRECISION} \ + --engine ${ENGINE_FILE} \ + --device ${DEVICE} + echo " "Generate Engine ${ENGINE_FILE} +fi +let step++ +echo; + +# Inference +echo [STEP ${step}] : Inference +run_cmd 
python3 ${RUN_DIR}/inference.py \ + --model_type "DDRNET23" \ + --engine_file ${ENGINE_FILE} \ + --test_mode MIOU \ + --dataset_dir ${DATASET_DIR} \ + --list_path ${LIST_PATH} \ + --flip \ + --bsz ${BSZ} \ + --target_mIoU ${TGT_0} \ + --target_mAcc ${TGT_1} \ + --loop_count -1 \ + --device ${DEVICE}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_performance.sh b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..42c4b2c751dc7d7f8cc4743e0fb3374c46b7f2c6 --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_fp16_performance.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +set -e + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +MODEL_NAME="ddrnet" +BSZ=4 +PRECISION="float16" +DEVICE=0 +FORCE_BUILD=0 +TGT=1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + -p | --precision) PRECISION=${arguments[index]};; + -d | --device) DEVICE=${arguments[index]};; + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + -f | --force) FORCE_BUILD=1;; + esac +done + +CHECKPOINTS_DIR="./checkpoints" +DATASET_DIR="/root/data/datasets" +LIST_PATH="/root/data/datasets/cityscapes/val.lst" +RUN_DIR="${RUN_DIR:-.}" +ORIGINE_MODEL="${CHECKPOINTS_DIR}/ddrnet23.onnx" + +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} +echo; + +function run_cmd() +{ + echo "[CMD]: $@" + eval $@ +} + +step=1 + +# Simplify Model +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + run_cmd python3 ${RUN_DIR}/sim_onnx_model.py \ + --raw_model_path ${ORIGINE_MODEL} \ + --sim_model_path ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi +let step++ +echo; + +# Build Engine +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ ${FORCE_BUILD} -eq 1 ] && [ -e ${ENGINE_FILE} ];then + rm ${ENGINE_FILE} +fi +echo "Building engine(${PRECISION})" +if [ -e ${ENGINE_FILE} ];then + echo " "Build Engine Skip, ${ENGINE_FILE} has been existed +else + run_cmd python3 ${RUN_DIR}/build_engine.py \ + --model ${SIM_MODEL} \ + --bsz ${BSZ} \ + --precision ${PRECISION} \ + --engine ${ENGINE_FILE} \ + --device ${DEVICE} + echo " "Generate Engine ${ENGINE_FILE} +fi +let step++ +echo; + +# Inference +echo [STEP ${step}] : Inference +run_cmd python3 ${RUN_DIR}/inference.py \ + --model_type "DDRNET23" \ 
+ --engine_file ${ENGINE_FILE} \ + --test_mode FPS \ + --dataset_dir ${DATASET_DIR} \ + --list_path ${LIST_PATH} \ + --target_fps ${TGT} \ + --loop_count 12 \ + --device ${DEVICE} + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_accuracy.sh b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3e59bb86165629138bd4d6243ac9aa17d30df5f --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_accuracy.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +set -e + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +MODEL_NAME="ddrnet" +BSZ=4 +PRECISION="int8" +DEVICE=0 +FORCE_BUILD=0 +TGT_0=-1 +TGT_1=-1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + -p | --precision) PRECISION=${arguments[index]};; + -d | --device) DEVICE=${arguments[index]};; + --bs) BSZ=${arguments[index]};; + --tgt_iou) TGT_0=${arguments[index]};; + --tgt_acc) TGT_1=${arguments[index]};; + -f | --force) FORCE_BUILD=1;; + esac +done + +CHECKPOINTS_DIR="./checkpoints" +DATASET_DIR="/root/data/datasets" +LIST_PATH="/root/data/datasets/cityscapes/val.lst" +RUN_DIR="${RUN_DIR:-.}" +ORIGINE_MODEL="${CHECKPOINTS_DIR}/ddrnet23.onnx" + +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} +echo; + +function run_cmd() +{ + echo "[CMD]: $@" + eval $@ +} + +step=1 + +# Simplify Model +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + run_cmd python3 ${RUN_DIR}/sim_onnx_model.py \ + --raw_model_path ${ORIGINE_MODEL} \ + --sim_model_path ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi +let step++ +echo; + +# Quant Model +if [ $PRECISION == "int8" ];then + echo [STEP ${step}] : Quant Model + QUANT_MODEL=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}_sim.onnx + if [ -f ${QUANT_MODEL} ];then + echo " "Quant Model Skip, ${QUANT_MODEL} has been existed + else + run_cmd python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --dataset_dir ${DATASET_DIR}/cityscapes \ + --save_dir ${CHECKPOINTS_DIR} + echo " "Generate ${QUANT_MODEL} + fi + SIM_MODEL=${QUANT_MODEL} + let step++ + echo; +fi + +# Build Engine +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ ${FORCE_BUILD} -eq 1 
] && [ -e ${ENGINE_FILE} ];then + rm ${ENGINE_FILE} +fi +echo "Building engine(${PRECISION})" +if [ -e ${ENGINE_FILE} ];then + echo " "Build Engine Skip, ${ENGINE_FILE} has been existed +else + run_cmd python3 ${RUN_DIR}/build_engine.py \ + --model ${SIM_MODEL} \ + --bsz ${BSZ} \ + --precision ${PRECISION} \ + --engine ${ENGINE_FILE} \ + --device ${DEVICE} + echo " "Generate Engine ${ENGINE_FILE} +fi +let step++ +echo; + +# Inference +echo [STEP ${step}] : Inference +run_cmd python3 ${RUN_DIR}/inference.py \ + --model_type "DDRNET23" \ + --engine_file ${ENGINE_FILE} \ + --test_mode MIOU \ + --dataset_dir ${DATASET_DIR} \ + --list_path ${LIST_PATH} \ + --flip \ + --bsz ${BSZ} \ + --target_mIoU ${TGT_0} \ + --target_mAcc ${TGT_1} \ + --loop_count -1 \ + --device ${DEVICE}; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_performance.sh b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..36fecd59d4b94dca203d5b006811fbd24603ffa0 --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/scripts/infer_ddrnet_int8_performance.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +set -e + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0));then + EXIT_STATUS=1 + fi +} + +MODEL_NAME="ddrnet" +BSZ=4 +PRECISION="int8" +DEVICE=0 +FORCE_BUILD=0 +TGT=1 + +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + -p | --precision) PRECISION=${arguments[index]};; + -d | --device) DEVICE=${arguments[index]};; + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + -f | --force) FORCE_BUILD=1;; + esac +done + +CHECKPOINTS_DIR="./checkpoints" +DATASET_DIR="/root/data/datasets" +LIST_PATH="/root/data/datasets/cityscapes/val.lst" +RUN_DIR="${RUN_DIR:-.}" +ORIGINE_MODEL="${CHECKPOINTS_DIR}/ddrnet23.onnx" + +echo ====================== Model Info ====================== +echo Model Name : ${MODEL_NAME} +echo Onnx Path : ${ORIGINE_MODEL} +echo; + +function run_cmd() +{ + echo "[CMD]: $@" + eval $@ +} + +step=1 + +# Simplify Model +echo [STEP ${step}] : Simplify Model +SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx +if [ -f ${SIM_MODEL} ];then + echo " "Simplify Model, ${SIM_MODEL} has been existed +else + run_cmd python3 ${RUN_DIR}/sim_onnx_model.py \ + --raw_model_path ${ORIGINE_MODEL} \ + --sim_model_path ${SIM_MODEL} + echo " "Generate ${SIM_MODEL} +fi +let step++ +echo; + +# Quant Model +if [ $PRECISION == "int8" ];then + echo [STEP ${step}] : Quant Model + QUANT_MODEL=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}_sim.onnx + if [ -f ${QUANT_MODEL} ];then + echo " "Quant Model Skip, ${QUANT_MODEL} has been existed + else + run_cmd python3 ${RUN_DIR}/quant.py \ + --model ${SIM_MODEL} \ + --dataset_dir ${DATASET_DIR}/cityscapes \ + --save_dir ${CHECKPOINTS_DIR} + echo " "Generate ${QUANT_MODEL} + fi + SIM_MODEL=${QUANT_MODEL} + let step++ + echo; +fi + +# Build Engine +echo [STEP ${step}] : Build Engine +ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine +if [ ${FORCE_BUILD} -eq 1 ] && [ -e ${ENGINE_FILE} ];then + rm ${ENGINE_FILE} +fi 
+echo "Building engine(${PRECISION})" +if [ -e ${ENGINE_FILE} ];then + echo " "Build Engine Skip, ${ENGINE_FILE} has been existed +else + run_cmd python3 ${RUN_DIR}/build_engine.py \ + --model ${SIM_MODEL} \ + --bsz ${BSZ} \ + --precision ${PRECISION} \ + --engine ${ENGINE_FILE} \ + --device ${DEVICE} + echo " "Generate Engine ${ENGINE_FILE} +fi +let step++ +echo; + +# Inference +echo [STEP ${step}] : Inference +run_cmd python3 ${RUN_DIR}/inference.py \ + --model_type "DDRNET23" \ + --engine_file ${ENGINE_FILE} \ + --test_mode FPS \ + --dataset_dir ${DATASET_DIR} \ + --list_path ${LIST_PATH} \ + --target_fps ${TGT} \ + --loop_count 12 \ + --device ${DEVICE} + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/sim_onnx_model.py b/models/cv/semantic_segmentation/ddrnet/ixrt/sim_onnx_model.py new file mode 100644 index 0000000000000000000000000000000000000000..98aa36c21df3b4ab7194236db9b5006a84ad1dcb --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/sim_onnx_model.py @@ -0,0 +1,17 @@ +import onnx +import argparse +from onnxsim import simplify + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--raw_model_path", type=str) + parser.add_argument("--sim_model_path", type=str) + args = parser.parse_args() + return args + + +args = parse_args() +onnx_model = onnx.load(args.raw_model_path) +model_simp, check = simplify(onnx_model) +onnx.save(model_simp, args.sim_model_path) +print('Simplify onnx Done.') diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/utils/__init__.py b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f0a54fef9f0be1c2930e394c114fc24d32c9be7 --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/__init__.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +# coding=utf-8 + +import numpy as np +from .dataset import Dataset +from .metrics import get_confusion_matrix + 
+ +def input_transform(image, mean, std): + image = image.astype(np.float32)[:, :, ::-1] + image = image / 255.0 + image -= mean + image /= std + return image diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/utils/dataset.py b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2d137c4bfdaa3c37ead38a047f7ad814401e6a4b --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/dataset.py @@ -0,0 +1,139 @@ +import os +import cv2 +import numpy as np +from math import ceil +from tqdm import tqdm + + +class Dataset: + def __init__(self, + root, + list_path, + batch_size=4, + num_classes=19, + ignore_label=255, + base_size=2048, + crop_size=(512, 1024), + downsample_rate=1, + scale_factor=16, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + + self.root = root + self.list_path = list_path + self.batch_size = batch_size + self.num_classes = num_classes + self.mean = mean + self.std = std + self.downsample_rate = downsample_rate + + self.img_list = [line.strip().split() for line in open(list_path)] + self.files = self.read_files() + self.num_batches = ceil(len(self.files) / self.batch_size) + + self.label_mapping = {-1: ignore_label, 0: ignore_label, + 1: ignore_label, 2: ignore_label, + 3: ignore_label, 4: ignore_label, + 5: ignore_label, 6: ignore_label, + 7: 0, 8: 1, 9: ignore_label, + 10: ignore_label, 11: 2, 12: 3, + 13: 4, 14: ignore_label, 15: ignore_label, + 16: ignore_label, 17: 5, 18: ignore_label, + 19: 6, 20: 7, 21: 8, 22: 9, 23: 10, 24: 11, + 25: 12, 26: 13, 27: 14, 28: 15, + 29: ignore_label, 30: ignore_label, + 31: 16, 32: 17, 33: 18} + + self.batch_images, self.batch_labels, self.batch_sizes, self.batch_names = self.batching() + + def read_files(self): + files = [] + for i, item in enumerate(self.img_list): + image_path, label_path = item + name = os.path.splitext(os.path.basename(label_path))[0] + files.append({ + "img": image_path, + "label": 
label_path, + "name": name, + "weight": 1 + }) + # if i == 4: + # break + return files + + def input_transform(self, image): + image = image.astype(np.float32)[:, :, ::-1] + image = image / 255.0 + image -= self.mean + image /= self.std + return image + + def label_transform(self, label): + temp = label.copy() + for k, v in self.label_mapping.items(): + label[temp == k] = v + return np.array(label).astype('int32') + + def gen_sample(self, image, label): + + image = self.input_transform(image) + label = self.label_transform(label) + + if self.downsample_rate != 1: + label = cv2.resize( + label, + None, + fx=self.downsample_rate, + fy=self.downsample_rate, + interpolation=cv2.INTER_NEAREST + ) + return image, label + + def _preprocess(self, index): + item = self.files[index] + name = item["name"] + image = cv2.imread(os.path.join(self.root,'cityscapes',item["img"]), + cv2.IMREAD_COLOR) + size = image.shape + label = cv2.imread(os.path.join(self.root,'cityscapes',item["label"]), + cv2.IMREAD_GRAYSCALE) + image, label = self.gen_sample(image, label) + return image.copy(), label.copy(), np.array(size), name + + def __len__(self): + return self.num_batches + + def __getitem__(self, index): + return (self.batch_images[index], self.batch_labels[index], self.batch_sizes[index], self.batch_names[index]) + + def batching(self): + all_images = [] + all_labels = [] + all_sizes = [] + all_names = [] + + num_batches = self.num_batches + batch_size = self.batch_size + for i in tqdm(range(len(self.files)), desc="Loading Cityscapes Dataset"): + image, label, size, name = self._preprocess(i) + all_images.append(image) + all_labels.append(label) + all_sizes.append(size) + all_names.append(name) + + batch_images = [] + batch_labels = [] + batch_sizes = [] + batch_names = [] + + for j in range(num_batches): + start = j * batch_size + if j == num_batches - 1: + end = None + else: + end = (j + 1) * batch_size + batch_images.append(np.stack(all_images[start:end])) + 
batch_labels.append(np.stack(all_labels[start:end])) + batch_sizes.append(np.stack(all_sizes[start:end])) + batch_names.append(all_names[start:end]) + return (batch_images, batch_labels, batch_sizes, batch_names) diff --git a/models/cv/semantic_segmentation/ddrnet/ixrt/utils/metrics.py b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..f0bdfcdf2093b0c2a542dd3ad0f161b8c4ee4ff4 --- /dev/null +++ b/models/cv/semantic_segmentation/ddrnet/ixrt/utils/metrics.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# coding=utf-8 + +""" +Define function to build confusion_matrix. +""" + +import numpy as np + + +def get_confusion_matrix(label, pred, size, num_class=19, ignore=-1): + """ + Calcute the confusion matrix by given label and pred + """ + output = pred + seg_pred = np.asarray(np.argmax(output, axis=3), dtype=np.uint8) + + seg_gt = np.asarray(label[:, :size[-3], :size[-2]], dtype=np.int32) + + ignore_index = seg_gt != ignore + seg_gt = seg_gt[ignore_index] + seg_pred = seg_pred[ignore_index] + + index = (seg_gt * num_class + seg_pred).astype('int32') + label_count = np.bincount(index) + confusion_matrix = np.zeros((num_class, num_class)) + + for i_label in range(num_class): + for i_pred in range(num_class): + cur_index = i_label * num_class + i_pred + if cur_index < len(label_count): + confusion_matrix[i_label, + i_pred] = label_count[cur_index] + return confusion_matrix + + +def get_confusion_matrix_batch(label, pred, size, num_class=19, ignore=-1): + """ + Calcute the confusion matrix by given label and pred in one batch. 
+ Arguments: + label: (batch_size, h, w) + pred: (batch_size, h, w, c) + size: (batch_size, 2) + """ + batch_size, h, w, c = pred.shape + confusion_matrix = np.zeros((num_class, num_class)) + for i in range(batch_size): + confusion_matrix += get_confusion_matrix( + label[i], + pred[i:i+1], + size[i], + 19, + 255 + ) + return confusion_matrix diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/README.md b/models/speech/speech_recognition/deepspeech2/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a903795df8bc04020eacb146358f6e45e5df5bb6 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/README.md @@ -0,0 +1,74 @@ +# DeepSpeech2 (ixRT) + +## Model Description + +DeepSpeech2 is an end-to-end speech recognition model based on RNNs and CTC decoding, developed by Baidu. It uses CNN for acoustic feature extraction followed by RNN encoders and CTC decoder. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.06 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Dataset: LibriSpeech + +### Install Dependencies + +Contact the Iluvatar administrator to get the missing packages: +- paddlepaddle-*.whl + +```bash +pip3 install librosa psutil pysoundfile pytest requests tensorboardX editdistance textgrid onnxsim paddlespeech_ctcdecoders paddleaudio paddlespeech +pip3 install numpy==1.23.5 +``` + +### Model Conversion + +```bash +mkdir checkpoints +cd checkpoints +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/deepspeech2.onnx +wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/common_crawl_00.prune01111.trie.klm + + +git clone https://gitee.com/deep-spark/iluvatar-corex-ixrt.git --depth=1 + +OPTIMIER_FILE=iluvatar-corex-ixrt/tools/optimizer/optimizer.py +echo "Build engine!" 
+python3 modify_model_to_dynamic.py --static_onnx checkpoints/deepspeech2.onnx --dynamic_onnx checkpoints/deepspeech2_dynamic.onnx +python3 ${OPTIMIER_FILE} --onnx checkpoints/deepspeech2_dynamic.onnx --model_type rnn --not_sim +python3 build_engine.py \ + --model_name deepspeech2 \ + --onnx_path checkpoints/deepspeech2_dynamic_end.onnx \ + --engine_path checkpoints/deepspeech2.engine + +``` + +## Model Inference + +```bash +export DATASETS_DIR=/path/to/LibriSpeech/ +export CHECKPOINTS_DIR=./checkpoints +export RUN_DIR=./ +``` + +### FP16 + +```bash +# Test ACC (WER) +bash scripts/infer_deepspeech2_fp16_accuracy.sh +# Test FPS +bash scripts/infer_deepspeech2_fp16_performance.sh +``` + +## Model Results + +| Model | BatchSize | Precision | ThroughPut | WER(%) | +| ------------ | --------- | --------- | ------- | ------ | +| DeepSpeech2 | 1 | FP16 | 1584.153 | 5.8 | diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/build_engine.py b/models/speech/speech_recognition/deepspeech2/ixrt/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e1ff2a4c1747e83eff4479095e18c179be6870f0 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/build_engine.py @@ -0,0 +1,82 @@ +import os +import json +import onnx +import logging +import argparse +import ctypes +import tensorrt +from tensorrt import Dims +from load_ixrt_plugin import load_ixrt_plugin + +load_ixrt_plugin() + +def parse_args(): + parser = argparse.ArgumentParser(description="Build tensorrt engine of deepspeech2") + parser.add_argument("--model_name", type=str, required=True, help="model name deepspeech2") + parser.add_argument("--onnx_path", type=str, required=True, help="The onnx path") + parser.add_argument("--bsz", type=int, default=1, help="batch size") + parser.add_argument("--input_size", type=tuple, default=(-1, 161), help="inference size") + parser.add_argument("--engine_path", type=str, required=True, help="engine path to save") + parser.add_argument( 
"--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4") + + args = parser.parse_args() + return args + + +def build_engine_trtapi_dynamicshape(args): + onnx_model = args.onnx_path + assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!" + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + + profile = builder.create_optimization_profile() + + profile.set_shape( + "input", Dims([1, 100, 161]), Dims([1, 1193, 161]), Dims([1, 3494, 161]) + ) + + build_config.add_optimization_profile(profile) + + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + + parser.parse_from_file(onnx_model) + build_config.set_flag(tensorrt.BuilderFlag.FP16) + + # set dynamic + input_tensor = network.get_input(0) + input_tensor.shape = Dims([1, -1, 161]) + + plan = builder.build_serialized_network(network, build_config) + with open(args.engine_path, "wb") as f: + f.write(plan) + + print("Build dynamic shape engine done!") + + +def build_engine_trtapi_staticshape(args): + onnx_model = args.onnx_path + assert os.path.isfile(onnx_model), f"The onnx model{onnx_model} must be existed!" 
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + + parser.parse_from_file(onnx_model) + build_config.set_flag(tensorrt.BuilderFlag.FP16) + + plan = builder.build_serialized_network(network, build_config) + with open(args.engine_path, "wb") as f: + f.write(plan) + + print("Build static shape engine done!") + + +if __name__ == "__main__": + args = parse_args() + build_engine_trtapi_dynamicshape(args) + # build_engine_trtapi_staticshape(args) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/ci/prepare.sh b/models/speech/speech_recognition/deepspeech2/ixrt/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..e6a6de447a700f473e393ba0fb88ed371814d2a8 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/ci/prepare.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x + +pip3 install librosa psutil pysoundfile pytest requests tensorboardX editdistance textgrid onnxsim paddlespeech_ctcdecoders paddleaudio paddlespeech +pip3 install numpy==1.23.5 + +mkdir -p checkpoints +cp /root/data/checkpoints/deepspeech2.onnx checkpoints/ +cp /root/data/checkpoints/common_crawl_00.prune01111.trie.klm checkpoints/ + + +OPTIMIER_FILE=/root/data/3rd_party/iluvatar-corex-ixrt/tools/optimizer/optimizer.py +echo "Build engine!" +python3 modify_model_to_dynamic.py --static_onnx checkpoints/deepspeech2.onnx --dynamic_onnx checkpoints/deepspeech2_dynamic.onnx +python3 ${OPTIMIER_FILE} --onnx checkpoints/deepspeech2_dynamic.onnx --model_type rnn --not_sim +python3 build_engine.py \ + --model_name deepspeech2 \ + --onnx_path checkpoints/deepspeech2_dynamic_end.onnx \ + --engine_path checkpoints/deepspeech2.engine diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/convert_weights.py b/models/speech/speech_recognition/deepspeech2/ixrt/convert_weights.py new file mode 100644 index 0000000000000000000000000000000000000000..16131d6a9189e2fef131ae124b250fca71e36144 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/convert_weights.py @@ -0,0 +1,141 @@ +import os +import onnx +import argparse +import numpy as np +from onnx import TensorProto, numpy_helper, helper + +def parse_args(): + parser = argparse.ArgumentParser(description="Convert the weight of lstm in model.") + parser.add_argument("--input_onnx", type=str, default="/home/yanlong.hao/DeepSpeech2/ixrt-modelzoo/data/checkpoints/deepspeech2/deepspeech2_part.onnx") + parser.add_argument("--output_onnx", type=str, default="/home/yanlong.hao/DeepSpeech2/ixrt-modelzoo/data/checkpoints/deepspeech2/deepspeech2.onnx") + + args = parser.parse_args() + return args + + +def convert_weights(args): + onnx_model = onnx.load(args.input_onnx) + graph = onnx_model.graph + node = graph.node + initializer = graph.initializer + + for i in range(len(node)): + if node[i].op_type == 
"LSTM": + count = 0 + for t in node[i].input: + if not t: + count += 1 + print("count: ", count) + for _ in range(count): + node[i].input.remove("") + + hidden_size = 0 + for j in range(len(node[i].attribute)): + if node[i].attribute[j].name == "hidden_size": + hidden_size = node[i].attribute[j].i + + w_name = node[i].input[1] + r_name = node[i].input[2] + b_name = node[i].input[3] + + w_data = None + r_data = None + b_data = None + + for data in initializer: + if data.name == node[i].input[1]: + dims = list(data.dims).copy() + dims_A = dims.copy() + w_origin_data = np.frombuffer(data.raw_data, dtype=np.float32) + W_save = np.transpose(w_origin_data.reshape(dims), [0, 2, 1]) + w1 = W_save[0, :, :hidden_size].reshape(-1) + w2 = W_save[0, :, hidden_size : hidden_size * 2].reshape(-1) + w3 = W_save[0, :, hidden_size * 2 : hidden_size * 3].reshape(-1) + w4 = W_save[0, :, hidden_size * 3 : hidden_size * 4].reshape(-1) + + w_r1 = W_save[1, :, :hidden_size].reshape(-1) + w_r2 = W_save[1, :, hidden_size : hidden_size * 2].reshape(-1) + w_r3 = W_save[1, :, hidden_size * 2 : hidden_size * 3].reshape(-1) + w_r4 = W_save[1, :, hidden_size * 3 : hidden_size * 4].reshape(-1) + + w_data = np.concatenate([w1, w2, w3, w4, w_r1, w_r2, w_r3, w_r4]) + print("w_data shape: ", w_data.shape) + + if data.name == node[i].input[2]: + dims = list(data.dims).copy() + dims_B = dims.copy() + r_origin_data = np.frombuffer(data.raw_data, dtype=np.float32) + R_save = np.transpose(r_origin_data.reshape(dims), [0, 2, 1]) + r1 = R_save[0, :, :hidden_size].reshape(-1) + r2 = R_save[0, :, hidden_size : hidden_size * 2].reshape(-1) + r3 = R_save[0, :, hidden_size * 2 : hidden_size * 3].reshape(-1) + r4 = R_save[0, :, hidden_size * 3 : hidden_size * 4].reshape(-1) + + r_r1 = R_save[1, :, :hidden_size].reshape(-1) + r_r2 = R_save[1, :, hidden_size : hidden_size * 2].reshape(-1) + r_r3 = R_save[1, :, hidden_size * 2 : hidden_size * 3].reshape(-1) + r_r4 = R_save[1, :, hidden_size * 3 : hidden_size * 
4].reshape(-1) + + r_data = np.concatenate([r1, r2, r3, r4, r_r1, r_r2, r_r3, r_r4]) + print("r_data shape: ", r_data.shape) + + if data.name == node[i].input[3]: + dims = data.dims + b_origin_data = np.frombuffer(data.raw_data, dtype=np.float32) + B_save = b_origin_data.reshape(dims) + bias_ih = B_save[0, : hidden_size * 4] + bias_hh = B_save[0, hidden_size * 4 : hidden_size * 8] + bias_f = bias_ih + bias_hh # bias add merge + bias_r_ih = B_save[1, : hidden_size * 4] + bias_r_hh = B_save[1, hidden_size * 4 : hidden_size * 8] + bias_r = bias_r_ih + bias_r_hh # bias add merge + b_data = np.concatenate([bias_f, bias_r]) + print("b_data shape: ", b_data.shape) + + for save_data in initializer: + if w_name == save_data.name: + save_data.raw_data=w_data.astype(np.float32).tobytes() + + elif r_name == save_data.name: + save_data.raw_data=r_data.astype(np.float32).tobytes() + + elif b_name == save_data.name: + save_data.raw_data=b_data.astype(np.float32).tobytes() + save_data.dims[1] = int(save_data.dims[1] / 2) + + + for data in initializer: + + if data.name == "p2o.helper.constant.2": + raw_data = np.frombuffer(data.raw_data, dtype=np.int64) + tmp_data = raw_data.copy() + tmp_data[0] = 1 + # tmp_data[0] = 16 + tmp_data[1] = -1 + tmp_data[2] = 1248 + data.raw_data = tmp_data.tobytes() + + lstm_reshape_name = "p2o.helper.constant.4" + # batch size: 1 + lstm_reshape_params = helper.make_tensor(lstm_reshape_name, onnx.TensorProto.INT64, [3], [-1,1,2048]) + # batch size: 16 + # lstm_reshape_params = helper.make_tensor(lstm_reshape_name, onnx.TensorProto.INT64, [3], [-1,16,2048]) + initializer.append(lstm_reshape_params) + + first_reshape_node = True + for i in range(len(node)): + if node[i].op_type == "Reshape": + if first_reshape_node: + first_reshape_node = False + continue + else: + node[i].input[1] = lstm_reshape_name + + onnx.save(onnx_model, args.output_onnx) + + + +if __name__ == "__main__": + args = parse_args() + convert_weights(args) + print("Save Down!") diff 
--git a/models/speech/speech_recognition/deepspeech2/ixrt/cut_onnx_model.py b/models/speech/speech_recognition/deepspeech2/ixrt/cut_onnx_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9f7ad073016994d9da0c676eb156a51605219547 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/cut_onnx_model.py @@ -0,0 +1,12 @@ +import os +import onnx + +base_path = "../../../../../data/checkpoints/deepspeech2" + +raw_path = os.path.join(base_path, "deepspeech2_all.onnx") +save_path = os.path.join(base_path, "deepspeech2_part.onnx") + +input_names = ["input"] +output_names = ["layer_norm_9.tmp_2"] + +onnx.utils.extract_model(raw_path, save_path, input_names, output_names) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/data/decoder.pdparams b/models/speech/speech_recognition/deepspeech2/ixrt/data/decoder.pdparams new file mode 100644 index 0000000000000000000000000000000000000000..c56060aa93728a5d471e95912f491ff0e021dcbe Binary files /dev/null and b/models/speech/speech_recognition/deepspeech2/ixrt/data/decoder.pdparams differ diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/data/demo_002_en.wav b/models/speech/speech_recognition/deepspeech2/ixrt/data/demo_002_en.wav new file mode 100644 index 0000000000000000000000000000000000000000..6dec925262b87ad659421edce892b0ab3b5039c4 Binary files /dev/null and b/models/speech/speech_recognition/deepspeech2/ixrt/data/demo_002_en.wav differ diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/data/mean_std.json b/models/speech/speech_recognition/deepspeech2/ixrt/data/mean_std.json new file mode 100644 index 0000000000000000000000000000000000000000..0867476f50aa0806df9ae6403a8d717c20cbe2ad --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/data/mean_std.json @@ -0,0 +1 @@ +{"mean_stat": [24156894.0, 12346911.0, 22422352.0, 24839050.0, -39564016.0, 26636840.0, 25011566.0, 24835082.0, 29086770.0, -39564016.0, 30332006.0, 27864978.0, 
29913940.0, 30884468.0, 27886950.0, 32087904.0, -39564016.0, 32251384.0, 22983980.0, 31812588.0, 27079002.0, 31846070.0, 27868144.0, 32247188.0, 27818408.0, 32808634.0, 28970758.0, 33002436.0, 30088680.0, 32556118.0, 31661602.0, 31608772.0, 32695986.0, 31184330.0, 32831278.0, 31139706.0, 31246538.0, 32284240.0, 30891642.0, 31133906.0, 31388504.0, 30680138.0, 30491506.0, 31091484.0, 30377426.0, 30421270.0, 30813302.0, 30274476.0, 30598140.0, 30471758.0, 30579814.0, 30329296.0, 30493232.0, 30623062.0, 30493558.0, 30204112.0, 30423514.0, 30253188.0, 30602832.0, 30749188.0, 30818314.0, 30849054.0, 30869560.0, 30913844.0, 30980630.0, 31086216.0, 31331224.0, 31680480.0, 31968040.0, 32049322.0, 32063286.0, 32256122.0, 32434294.0, 32573840.0, 32674094.0, 32730916.0, 32978416.0, 33109402.0, 32979748.0, 33051350.0, 33052606.0, 33441594.0, 33257368.0, 33116758.0, 33340022.0, 33602192.0, 33161626.0, 33500680.0, 33578800.0, 33243446.0, 33923204.0, 33347320.0, 34025008.0, 33657892.0, 34250184.0, 33890276.0, 34463444.0, 34195792.0, 34383156.0, 34520940.0, 34386284.0, 34438136.0, 34548276.0, 34413796.0, 34502292.0, 34560776.0, 34626944.0, 34570476.0, 34526264.0, 34546036.0, 34544248.0, 34544372.0, 34543380.0, 34524356.0, 34496476.0, 34466152.0, 34515864.0, 34529828.0, 34519284.0, 34534880.0, 34563896.0, 34623720.0, 34452040.0, 34501760.0, 34289436.0, 34102164.0, 34146876.0, 33918084.0, 33886240.0, 33774224.0, 33625140.0, 33574368.0, 33387480.0, 33303508.0, 33195294.0, 33030560.0, 32926086.0, 32853062.0, 32793650.0, 32764236.0, 32693808.0, 32635580.0, 32590254.0, 32567152.0, 32705322.0, 32613326.0, 32610814.0, 32676314.0, 32564762.0, 32590186.0, 32465998.0, 32398008.0, 32458644.0, 32346022.0, 32200392.0, 32081072.0, 31974116.0, 31883154.0, 31762528.0, 31613754.0, 31392360.0], "var_stat": [272001248.0, 94793536.0, 243428896.0, 290309088.0, 637701760.0, 336307072.0, 301866720.0, 305879168.0, 399924352.0, 637701760.0, 435734816.0, 376857024.0, 429138816.0, 453332832.0, 381586304.0, 
484897120.0, 637701760.0, 487575968.0, 277610784.0, 473961248.0, 359646656.0, 474523328.0, 377121184.0, 486117824.0, 377040512.0, 502208736.0, 403991680.0, 507742560.0, 432004512.0, 495375488.0, 471045696.0, 469675360.0, 497274720.0, 456176672.0, 499777376.0, 454574560.0, 457307136.0, 483500736.0, 445531520.0, 452447296.0, 457813120.0, 438017440.0, 434063200.0, 448154080.0, 428310304.0, 428972224.0, 439074464.0, 424764320.0, 431150144.0, 427725120.0, 430921920.0, 424025312.0, 426546304.0, 429184832.0, 425736928.0, 419302016.0, 424773664.0, 419875488.0, 427891680.0, 431239648.0, 432819488.0, 433514496.0, 433968800.0, 434929376.0, 436460768.0, 438922048.0, 443221952.0, 449724736.0, 456049792.0, 459802528.0, 463549376.0, 469183936.0, 474003584.0, 477739808.0, 480394752.0, 481895008.0, 488445952.0, 491934816.0, 488405504.0, 490087968.0, 489980800.0, 500284928.0, 495156320.0, 491229760.0, 496754208.0, 503509888.0, 491718784.0, 500877728.0, 503081248.0, 494274336.0, 512897952.0, 497379520.0, 516031520.0, 506352192.0, 523052768.0, 513294720.0, 529277536.0, 521843712.0, 527096864.0, 530821984.0, 526553824.0, 527675936.0, 530968416.0, 527220224.0, 529501184.0, 530768864.0, 532748704.0, 531494624.0, 529949632.0, 530147264.0, 529992288.0, 529933504.0, 529808640.0, 529262784.0, 528516896.0, 527792320.0, 529182784.0, 529615936.0, 529410560.0, 529778816.0, 530344224.0, 531732640.0, 525803968.0, 526245088.0, 522054464.0, 517174528.0, 518361600.0, 511909600.0, 510847392.0, 507652000.0, 503427968.0, 501848896.0, 496581568.0, 493911488.0, 490921440.0, 486737472.0, 483823584.0, 481722496.0, 480019008.0, 478763264.0, 476989056.0, 475359136.0, 473979872.0, 473168672.0, 476439520.0, 474125792.0, 473999968.0, 475610112.0, 472580128.0, 473176800.0, 469737056.0, 467798176.0, 468517056.0, 465801504.0, 462322720.0, 459281952.0, 456547808.0, 454174240.0, 450991168.0, 447175136.0, 441819584.0], "frame_num": 2454662} \ No newline at end of file diff --git 
a/models/speech/speech_recognition/deepspeech2/ixrt/data/preprocess.yaml b/models/speech/speech_recognition/deepspeech2/ixrt/data/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f526e0ad34206199ee5593c6ca190409a335593 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/data/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 161 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/dataset/__init__.py b/models/speech/speech_recognition/deepspeech2/ixrt/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf3a39f671298828c16a584ca13ba67b3d68f2ae --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/dataset/__init__.py @@ -0,0 +1,2 @@ + +from .librispeech import LibriSpeech diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/dataset/librispeech.py b/models/speech/speech_recognition/deepspeech2/ixrt/dataset/librispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0a88a617369677b40f2d3d2b7926d76803635c --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/dataset/librispeech.py @@ -0,0 +1,70 @@ +""" +Define the dataset of LibriSpeech +""" + +import os +import glob +import soundfile +import numpy as np + + +class LibriSpeech: + def __init__(self, dataroot, transform=None): + assert os.path.exists(dataroot), f"The {dataroot} must be existed!" 
+ self.dataroot = dataroot + self._parse_file() + self.transform = transform + + def _parse_file(self): + """ + Parse the test-clean data and groundtruth + """ + audio_names = [] + text_transcripts = [] + text_pattern = os.path.join(self.dataroot, "test-clean", "*", "*", "*.trans.txt") + text_files = glob.glob(text_pattern) + print(f"[INFO]: text files length: {len(text_files)}") + + for text_file in text_files: + # print(f"Processing: {os.path.basename(text_file)}") + + with open(text_file, 'r') as f: + lines = f.readlines() + + lines = [line.strip().split(' ', maxsplit=1) for line in lines] + for line in lines: + audio_name, text = line + audio_names.append(audio_name) + text_transcripts.append(text) + + self.audio_names = audio_names + self.text_transcripts = text_transcripts + print("[INFO]: Achieve Parsing!") + + def __len__(self): + return len(self.audio_names) + + def __getitem__(self, idx): + + audio_name = self.audio_names[idx] + text_gt = self.text_transcripts[idx] + + # print(f"audio_name: {audio_name}") + # print(f"text_gt: {text_gt}") + + name, subname, _ = audio_name.split('-') + audio_file = os.path.join(self.dataroot, "test-clean", name, subname, audio_name + ".flac") + + audio, sample_rate = soundfile.read(audio_file, dtype="int16", always_2d=True) + audio = audio[:, 0] + # print(f"audio shape: {audio.shape}") + + if self.transform is None: + input_data = audio + else: + preprocess_args = {"train": False} + input_data = self.transform(audio, **preprocess_args) + + input_data = np.expand_dims(input_data.astype(np.float32), axis=0) + return input_data, text_gt + diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/__init__.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1f235fe7d0bc2fb623e23493506ef9896a392fbf --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/__init__.py @@ -0,0 +1,2 @@ + +from .ctc import 
CTCDecoder diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/align.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/align.py new file mode 100644 index 0000000000000000000000000000000000000000..34d796145c65fa430ef0d05251ce7a728d9d8f9b --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/align.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +from paddle import nn +""" + To align the initializer between paddle and torch, + the API below are set defalut initializer with priority higger than global initializer. 
+""" +global_init_type = None + + +class LayerNorm(nn.LayerNorm): + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(LayerNorm, self).__init__(normalized_shape, epsilon, weight_attr, + bias_attr, name) + + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCL', + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(BatchNorm1D, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, name) + + +class Embedding(nn.Embedding): + def __init__(self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal()) + super(Embedding, self).__init__(num_embeddings, embedding_dim, + padding_idx, sparse, weight_attr, name) + + +class Linear(nn.Linear): + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + super(Linear, self).__init__(in_features, out_features, weight_attr, + 
bias_attr, name) + + +class Conv1D(nn.Conv1D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCL'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + super(Conv1D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) + + +class Conv2D(nn.Conv2D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) + super(Conv2D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc.py new file mode 100644 index 0000000000000000000000000000000000000000..1069bec8ef2ac30b7ee6da831cbb7a4a79673551 --- /dev/null +++ 
b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc.py @@ -0,0 +1,443 @@ +import sys +from typing import Union + +import paddle +from paddle import nn +from paddle.nn import functional as F + +from .align import Linear +from .loss import CTCLoss + +from . import ctc_utils +from .ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401 +from .ctcdecoder import ctc_greedy_decoding # noqa: F401 +from .ctcdecoder import Scorer # noqa: F401 +from .ctcdecoder import CTCBeamSearchDecoder # noqa: F401 + + +__all__ = ['CTCDecoder'] + + +class CTCDecoderBase(nn.Layer): + def __init__(self, + odim, + enc_n_units, + blank_id=0, + dropout_rate: float=0.0, + reduction: Union[str, bool]=True, + batch_average: bool=True, + grad_norm_type: Union[str, None]=None): + """CTC decoder + + Args: + odim ([int]): text vocabulary size + enc_n_units ([int]): encoder output dimention + dropout_rate (float): dropout rate (0.0 ~ 1.0) + reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' + batch_average (bool): do batch dim wise average. + grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None. + """ + super().__init__() + + self.blank_id = blank_id + self.odim = odim + self.dropout = nn.Dropout(dropout_rate) + self.ctc_lo = Linear(enc_n_units, self.odim) + if isinstance(reduction, bool): + reduction_type = "sum" if reduction else "none" + else: + reduction_type = reduction + self.criterion = CTCLoss( + blank=self.blank_id, + reduction=reduction_type, + batch_average=batch_average, + grad_norm_type=grad_norm_type) + + def forward(self, hs_pad, hlens, ys_pad, ys_lens): + """Calculate CTC loss. + + Args: + hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D) + hlens (Tensor): batch of lengths of hidden state sequences (B) + ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax) + ys_lens (Tensor): batch of lengths of character sequence (B) + Returns: + loss (Tensor): ctc loss value, scalar. 
+ """ + logits = self.ctc_lo(self.dropout(hs_pad)) + loss = self.criterion(logits, ys_pad, hlens, ys_lens) + return loss + + def softmax(self, eouts: paddle.Tensor, temperature: float=1.0): + """Get CTC probabilities. + Args: + eouts (FloatTensor): `[B, T, enc_units]` + Returns: + probs (FloatTensor): `[B, T, odim]` + """ + self.probs = F.softmax(self.ctc_lo(eouts) / temperature, axis=2) + return self.probs + + def log_softmax(self, hs_pad: paddle.Tensor, + temperature: float=1.0) -> paddle.Tensor: + """log_softmax of frame activations + Args: + Tensor hs_pad: 3d tensor (B, Tmax, eprojs) + Returns: + paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim) + """ + return F.log_softmax(self.ctc_lo(hs_pad) / temperature, axis=2) + + def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor: + """argmax of frame activations + Args: + paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) + Returns: + paddle.Tensor: argmax applied 2d tensor (B, Tmax) + """ + return paddle.argmax(self.ctc_lo(hs_pad), dim=2) + + def forced_align(self, + ctc_probs: paddle.Tensor, + y: paddle.Tensor, + blank_id=0) -> list: + """ctc forced alignment. + Args: + ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D) + y (paddle.Tensor): label id sequence tensor, 1d tensor (L) + blank_id (int): blank symbol index + Returns: + paddle.Tensor: best alignment result, (T). + """ + return ctc_utils.forced_align(ctc_probs, y, blank_id) + + +class CTCDecoder(CTCDecoderBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # CTCDecoder LM Score handle + self._ext_scorer = None + self.beam_search_decoder = None + + def _decode_batch_greedy_offline(self, probs_split, vocab_list): + """This function will be deprecated in future. + Decode by best path for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. 
+ :param probs_split: List of matrix + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :return: List of transcription texts. + :rtype: List of str + """ + results = [] + for i, probs in enumerate(probs_split): + output_transcription = ctc_greedy_decoding( + probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id) + results.append(output_transcription) + return results + + def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path, + vocab_list): + """Initialize the external scorer. + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param language_model_path: Filepath for language model. If it is + empty, the external scorer will be set to + None, and the decoding method will be pure + beam search without scorer. + :type language_model_path: str|None + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + """ + # init once + if self._ext_scorer is not None: + return + + if language_model_path != '': + print("begin to initialize the external scorer " + "for decoding") + self._ext_scorer = Scorer(beam_alpha, beam_beta, + language_model_path, vocab_list) + lm_char_based = self._ext_scorer.is_character_based() + lm_max_order = self._ext_scorer.get_max_order() + lm_dict_size = self._ext_scorer.get_dict_size() + print("language model: " + "is_character_based = %d," % lm_char_based + + " max_order = %d," % lm_max_order + " dict_size = %d" % + lm_dict_size) + print("end initializing scorer") + else: + self._ext_scorer = None + print("no language model provided, " + "decoding by pure beam search without scorer.") + + def _decode_batch_beam_search_offline( + self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, vocab_list, num_processes): + """ + This function will be deprecated in future. 
+ Decode by beam search for a batch of probs matrix input. + :param probs_split: List of 2-D probability matrix, and each consists + of prob vectors for one speech utterancce. + :param probs_split: List of matrix + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param beam_size: Width for Beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :param num_processes: Number of processes (CPU) for decoder. + :type num_processes: int + :return: List of transcription texts. + :rtype: List of str + """ + if self._ext_scorer is not None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + + # beam search decode + num_processes = min(num_processes, len(probs_split)) + beam_search_results = ctc_beam_search_decoding_batch( + probs_split=probs_split, + vocabulary=vocab_list, + beam_size=beam_size, + num_processes=num_processes, + ext_scoring_func=self._ext_scorer, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + blank_id=self.blank_id) + + results = [result[0][1] for result in beam_search_results] + return results + + def init_decoder(self, batch_size, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + """ + init ctc decoders + Args: + batch_size(int): Batch size for input data + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + 
beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes + + Raises: + ValueError: when decoding_method not support. + + Returns: + CTCBeamSearchDecoder + """ + self.batch_size = batch_size + self.vocab_list = vocab_list + self.decoding_method = decoding_method + self.beam_size = beam_size + self.cutoff_prob = cutoff_prob + self.cutoff_top_n = cutoff_top_n + self.num_processes = num_processes + if decoding_method == "ctc_beam_search": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + if self.beam_search_decoder is None: + self.beam_search_decoder = self.get_decoder( + vocab_list, batch_size, beam_alpha, beam_beta, beam_size, + num_processes, cutoff_prob, cutoff_top_n) + return self.beam_search_decoder + elif decoding_method == "ctc_greedy": + self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, + vocab_list) + else: + raise ValueError(f"Not support: {decoding_method}") + + def decode_probs_offline(self, probs, logits_lens, vocab_list, + decoding_method, lang_model_path, beam_alpha, + beam_beta, beam_size, cutoff_prob, cutoff_top_n, + num_processes): + """ + This function will be deprecated in future. + ctc decoding with probs. + Args: + probs (Tensor): activation after softmax + logits_lens (Tensor): audio output lens + vocab_list (list): List of tokens in the vocabulary, for decoding + decoding_method (str): ctc_beam_search + lang_model_path (str): language model path + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + num_processes (int): num_processes + + Raises: + ValueError: when decoding_method not support. + + Returns: + List[str]: transcripts. 
+ """ + probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] + if decoding_method == "ctc_greedy": + result_transcripts = self._decode_batch_greedy_offline( + probs_split=probs_split, vocab_list=vocab_list) + elif decoding_method == "ctc_beam_search": + result_transcripts = self._decode_batch_beam_search_offline( + probs_split=probs_split, + beam_alpha=beam_alpha, + beam_beta=beam_beta, + beam_size=beam_size, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + vocab_list=vocab_list, + num_processes=num_processes) + else: + raise ValueError(f"Not support: {decoding_method}") + return result_transcripts + + def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta, + beam_size, num_processes, cutoff_prob, cutoff_top_n): + """ + init get ctc decoder + Args: + vocab_list (list): List of tokens in the vocabulary, for decoding. + batch_size(int): Batch size for input data + beam_alpha (float): beam_alpha + beam_beta (float): beam_beta + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + + Raises: + ValueError: when decoding_method not support. + + Returns: + CTCBeamSearchDecoder + """ + num_processes = min(num_processes, batch_size) + if self._ext_scorer is not None: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + if self.decoding_method == "ctc_beam_search": + beam_search_decoder = CTCBeamSearchDecoder( + vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, self._ext_scorer, self.blank_id) + else: + raise ValueError(f"Not support: {decoding_method}") + return beam_search_decoder + + def next(self, probs, logits_lens): + """ + Input probs into ctc decoder + Args: + probs (list(list(float))): probs for a batch of data + logits_lens (list(int)): logits lens for a batch of data + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. 
+ """ + + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + beam_search_decoder = self.beam_search_decoder + + has_value = (logits_lens > 0).tolist() + has_value = [ + "true" if has_value[i] is True else "false" + for i in range(len(has_value)) + ] + probs_split = [ + probs[i, :l, :].tolist() if has_value[i] else probs[i].tolist() + for i, l in enumerate(logits_lens) + ] + if self.decoding_method == "ctc_beam_search": + beam_search_decoder.next(probs_split, has_value) + else: + raise ValueError(f"Not support: {decoding_method}") + + return + + def decode(self): + """ + Get the decoding result + Raises: + Exception: when the ctc decoder is not initialized + ValueError: when decoding_method not support. + Returns: + results_best (list(str)): The best result for a batch of data + results_beam (list(list(str))): The beam search result for a batch of data + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + + beam_search_decoder = self.beam_search_decoder + if self.decoding_method == "ctc_beam_search": + batch_beam_results = beam_search_decoder.decode() + batch_beam_results = [[(res[0], res[1]) for res in beam_results] + for beam_results in batch_beam_results] + results_best = [result[0][1] for result in batch_beam_results] + results_beam = [[trans[1] for trans in result] + for result in batch_beam_results] + + else: + raise ValueError(f"Not support: {decoding_method}") + + return results_best, results_beam + + def reset_decoder(self, + batch_size=-1, + beam_size=-1, + num_processes=-1, + cutoff_prob=-1.0, + cutoff_top_n=-1): + if batch_size > 0: + self.batch_size = batch_size + if beam_size > 0: + self.beam_size = beam_size + if num_processes > 0: + self.num_processes = num_processes + if cutoff_prob > 0: + self.cutoff_prob = cutoff_prob + if cutoff_top_n > 0: + self.cutoff_top_n = cutoff_top_n + """ + Reset the decoder state + 
Args: + batch_size(int): Batch size for input data + beam_size (int): beam_size + num_processes (int): num_processes + cutoff_prob (float): cutoff probability in beam search + cutoff_top_n (int): cutoff_top_n + Raises: + Exception: when the ctc decoder is not initialized + """ + if self.beam_search_decoder is None: + raise Exception( + "You need to initialize the beam_search_decoder firstly") + self.beam_search_decoder.reset_state( + self.batch_size, self.beam_size, self.num_processes, + self.cutoff_prob, self.cutoff_top_n) + + def del_decoder(self): + """ + Delete the decoder + """ + if self.beam_search_decoder is not None: + del self.beam_search_decoder + self.beam_search_decoder = None diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc_utils.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..df60028cd1fd77f6c534a78bf4844c3a49a668ce --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctc_utils.py @@ -0,0 +1,195 @@ +from pathlib import Path +from typing import List + +import numpy as np +import paddle + +from .utils import text_grid +from .utils import utility + + +__all__ = ["forced_align", "remove_duplicates_and_blank", "insert_blank"] + + +def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]: + """ctc alignment to ctc label ids. + + "abaa-acee-" -> "abaace" + + Args: + hyp (List[int]): hypotheses ids, (L) + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + List[int]: remove dupicate ids, then remove blank id. 
+ """ + new_hyp: List[int] = [] + cur = 0 + while cur < len(hyp): + # add non-blank into new_hyp + if hyp[cur] != blank_id: + new_hyp.append(hyp[cur]) + # skip repeat label + prev = cur + while cur < len(hyp) and hyp[cur] == hyp[prev]: + cur += 1 + return new_hyp + + +def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray: + """Insert blank token between every two label token. + + "abcdefg" -> "-a-b-c-d-e-f-g-" + + Args: + label ([np.ndarray]): label ids, List[int], (L). + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + [np.ndarray]: (2L+1). + """ + label = np.expand_dims(label, 1) #[L, 1] + blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id + label = np.concatenate([blanks, label], axis=1) #[L, 2] + label = label.reshape(-1) #[2L], -l-l-l + label = np.append(label, label[0]) #[2L + 1], -l-l-l- + return label + + +def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, + blank_id=0) -> List[int]: + """ctc forced alignment. + + https://distill.pub/2017/ctc/ + + Args: + ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D) + y (paddle.Tensor): label id sequence tensor, 1d tensor (L) + blank_id (int): blank symbol index + Returns: + List[int]: best alignment result, (T). 
+ """ + y_insert_blank = insert_blank(y, blank_id) #(2L+1) + + log_alpha = paddle.zeros( + (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1) + log_alpha = log_alpha - float('inf') # log of zero + + # TODO(Hui Zhang): zeros not support paddle.int16 + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 + state_path = (paddle.zeros( + (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1 + ) # state path, Tuple((T, 2L+1)) + + # init start state + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + + for t in range(1, ctc_probs.shape[0]): # T + for s in range(len(y_insert_blank)): # 2L+1 + if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ + s] == y_insert_blank[s - 2]: + candidates = paddle.to_tensor( + [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) + prev_state = [s, s - 1] + else: + candidates = paddle.to_tensor([ + log_alpha[t - 1, s], + log_alpha[t - 1, s - 1], + log_alpha[t - 1, s - 2], + ]) + prev_state = [s, s - 1, s - 2] + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( + y_insert_blank[s])] + state_path[t, s] = prev_state[paddle.argmax(candidates)] + # TODO(Hui Zhang): zeros not support paddle.int16 + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 + state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32) + + candidates = paddle.to_tensor([ + log_alpha[-1, len(y_insert_blank) - 1], # Sb + log_alpha[-1, len(y_insert_blank) - 2] # Snb + ]) + prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] + state_seq[-1] = prev_state[paddle.argmax(candidates)] + for t in range(ctc_probs.shape[0] - 2, -1, -1): + state_seq[t] = 
state_path[t + 1, state_seq[t + 1, 0]] + + output_alignment = [] + for t in range(0, ctc_probs.shape[0]): + output_alignment.append(y_insert_blank[state_seq[t, 0]]) + + return output_alignment + + +def ctc_align(config, model, dataloader, batch_size, stride_ms, token_dict, + result_file): + """ctc alignment. + + Args: + config (cfgNode): config + model (nn.Layer): U2 Model. + dataloader (io.DataLoader): dataloader. + batch_size (int): decoding batchsize. + stride_ms (int): audio feature stride in ms unit. + token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '']. + result_file (str): alignment output file, e.g. /path/to/xxx.align. + """ + if batch_size > 1: + print('alignment mode must be running with batch_size == 1') + sys.exit(1) + assert result_file and result_file.endswith('.align') + + model.eval() + # conv subsampling rate + subsample = utility.get_subsample(config) + print(f"Align Total Examples: {len(dataloader.dataset)}") + + with open(result_file, 'w') as fout: + # one example in batch + for i, batch in enumerate(dataloader): + key, feat, feats_length, target, target_length = batch + + # 1. Encoder + encoder_out, encoder_mask = model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.shape[1] + ctc_probs = model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + + # 2. alignment + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = forced_align(ctc_probs, target) + + print(f"align ids: {key[0]} {alignment}") + fout.write('{} {}\n'.format(key[0], alignment)) + + # 3. 
gen praat + # segment alignment + align_segs = text_grid.segment_alignment(alignment) + print(f"align tokens: {key[0]}, {align_segs}") + + # IntervalTier, List["start end token\n"] + tierformat = text_grid.align_to_tierformat(align_segs, subsample, + token_dict) + + # write tier + align_output_path = Path(result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: + f.writelines(tierformat) + + # write textgrid + textgrid_path = align_output_path / (key[0] + ".TextGrid") + second_per_frame = 1. / (1000. / + stride_ms) # 25ms window, 10ms stride + second_per_example = ( + len(alignment) + 1) * subsample * second_per_frame + text_grid.generate_textgrid( + maxtime=second_per_example, + intervals=tierformat, + output=str(textgrid_path)) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/__init__.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37ceae6e5f8c3016713c4417ea167dec9e3fdc42 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .swig_wrapper import ctc_beam_search_decoding +from .swig_wrapper import ctc_beam_search_decoding_batch +from .swig_wrapper import ctc_greedy_decoding +from .swig_wrapper import CTCBeamSearchDecoder +from .swig_wrapper import Scorer diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/decoders_deprecated.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/decoders_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..0c391ead1ffd3e42ee71586ff1ea9cdd1b1d5285 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/decoders_deprecated.py @@ -0,0 +1,235 @@ +"""Contains various CTC decoders.""" +import multiprocessing +from itertools import groupby +from math import log + +import numpy as np + + +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + + Path consisting of the most probable tokens are further post-processed to + remove consecutive repetitions and all blanks. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. 
+ :rtype: baseline + """ + # dimension verification + for probs in probs_seq: + if not len(probs) == len(vocabulary) + 1: + raise ValueError("probs_seq dimension mismatchedd with vocabulary") + # argmax to get the best index for each time step + max_index_list = list(np.array(probs_seq).argmax(axis=1)) + # remove consecutive duplicate indexes + index_list = [index_group[0] for index_group in groupby(max_index_list)] + # remove blank indexes + blank_index = len(vocabulary) + index_list = [index for index in index_list if index != blank_index] + # convert index list to string + return ''.join([vocabulary[index] for index in index_list]) + + +def ctc_beam_search_decoder(probs_seq, + beam_size, + vocabulary, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + nproc=False): + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is + redesigned. Two important modifications: 1) in the iterative computation + of probabilities, the assignment operation is changed to accumulation for + one prefix may comes from different paths; 2) the if condition "if l^+ not + in A_prev then" after probabilities' computation is deprecated for it is + hard to understand and seems unnecessary. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param beam_size: Width for beam search. + :type beam_size: int + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. 
+ :type external_scoring_func: callable + :param nproc: Whether the decoder used in multiprocesses. + :type nproc: bool + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + # dimension check + for prob_list in probs_seq: + if not len(prob_list) == len(vocabulary) + 1: + raise ValueError("The shape of prob_seq does not match with the " + "shape of the vocabulary.") + + # blank_id assign + blank_id = len(vocabulary) + + # If the decoder called in the multiprocesses, then use the global scorer + # instantiated in ctc_beam_search_decoder_batch(). + if nproc is True: + global ext_nproc_scorer + ext_scoring_func = ext_nproc_scorer + + # initialize + # prefix_set_prev: the set containing selected prefixes + # probs_b_prev: prefixes' probability ending with blank in previous step + # probs_nb_prev: prefixes' probability ending with non-blank in previous step + prefix_set_prev = {'\t': 1.0} + probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0} + + # extend prefix in loop + for time_step in range(len(probs_seq)): + # prefix_set_next: the set containing candidate prefixes + # probs_b_cur: prefixes' probability ending with blank in current step + # probs_nb_cur: prefixes' probability ending with non-blank in current step + prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {} + + prob_idx = list(enumerate(probs_seq[time_step])) + cutoff_len = len(prob_idx) + # If pruning is enabled + if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len: + prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True) + cutoff_len, cum_prob = 0, 0.0 + for i in range(len(prob_idx)): + cum_prob += prob_idx[i][1] + cutoff_len += 1 + if cum_prob >= cutoff_prob: + break + cutoff_len = min(cutoff_len, cutoff_top_n) + prob_idx = prob_idx[0:cutoff_len] + + for l in prefix_set_prev: + if l not in prefix_set_next: + probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0 + + # extend prefix by travering prob_idx + for 
index in range(cutoff_len): + c, prob_c = prob_idx[index][0], prob_idx[index][1] + + if c == blank_id: + probs_b_cur[l] += prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + else: + last_char = l[-1] + new_char = vocabulary[c] + l_plus = l + new_char + if l_plus not in prefix_set_next: + probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0 + + if new_char == last_char: + probs_nb_cur[l_plus] += prob_c * probs_b_prev[l] + probs_nb_cur[l] += prob_c * probs_nb_prev[l] + elif new_char == ' ': + if (ext_scoring_func is None) or (len(l) == 1): + score = 1.0 + else: + prefix = l[1:] + score = ext_scoring_func(prefix) + probs_nb_cur[l_plus] += score * prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + else: + probs_nb_cur[l_plus] += prob_c * ( + probs_b_prev[l] + probs_nb_prev[l]) + # add l_plus into prefix_set_next + prefix_set_next[l_plus] = probs_nb_cur[ + l_plus] + probs_b_cur[l_plus] + # add l into prefix_set_next + prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l] + # update probs + probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur + + # store top beam_size prefixes + prefix_set_prev = sorted( + prefix_set_next.items(), key=lambda asd: asd[1], reverse=True) + if beam_size < len(prefix_set_prev): + prefix_set_prev = prefix_set_prev[:beam_size] + prefix_set_prev = dict(prefix_set_prev) + + beam_result = [] + for seq, prob in prefix_set_prev.items(): + if prob > 0.0 and len(seq) > 1: + result = seq[1:] + # score last word by external scorer + if (ext_scoring_func is not None) and (result[-1] != ' '): + prob = prob * ext_scoring_func(result) + log_prob = log(prob) + beam_result.append((log_prob, result)) + else: + beam_result.append((float('-inf'), '')) + + # output top beam_size decoding results + beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True) + return beam_result + + +def ctc_beam_search_decoder_batch(probs_split, + beam_size, + vocabulary, + num_processes, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None): + """CTC beam 
search decoder using multiple processes. + + :param probs_seq: 3-D list with each element as an instance of 2-D list + of probabilities used by ctc_beam_search_decoder(). + :type probs_seq: 3-D list + :param beam_size: Width for beam search. + :type beam_size: int + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param num_processes: Number of parallel processes. + :type num_processes: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param num_processes: Number of parallel processes. + :type num_processes: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_function: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + if not num_processes > 0: + raise ValueError("Number of processes must be positive!") + + # use global variable to pass the externnal scorer to beam search decoder + global ext_nproc_scorer + ext_nproc_scorer = ext_scoring_func + nproc = True + + pool = multiprocessing.Pool(processes=num_processes) + results = [] + for i, probs_list in enumerate(probs_split): + args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n, + None, nproc) + results.append(pool.apply_async(ctc_beam_search_decoder, args)) + + pool.close() + pool.join() + beam_search_results = [result.get() for result in results] + return beam_search_results diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/scorer_deprecated.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/scorer_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..362098fe65ec34106926e1804dfbb5abb273d97d --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/scorer_deprecated.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""External Scorer for Beam Search Decoder.""" +import os + +import kenlm +import numpy as np + + +class Scorer(object): + """External scorer to evaluate a prefix or whole sentence in + beam search decoding, including the score from n-gram language + model and word count. + + :param alpha: Parameter associated with language model. Don't use + language model when alpha = 0. + :type alpha: float + :param beta: Parameter associated with word count. Don't use word + count when beta = 0. + :type beta: float + :model_path: Path to load language model. 
+ :type model_path: str + """ + + def __init__(self, alpha, beta, model_path): + self._alpha = alpha + self._beta = beta + if not os.path.isfile(model_path): + raise IOError("Invaid language model path: %s" % model_path) + self._language_model = kenlm.LanguageModel(model_path) + + # n-gram language model scoring + def _language_model_score(self, sentence): + #log10 prob of last word + log_cond_prob = list( + self._language_model.full_scores(sentence, eos=False))[-1][0] + return np.power(10, log_cond_prob) + + # word insertion term + def _word_count(self, sentence): + words = sentence.strip().split(' ') + return len(words) + + # reset alpha and beta + def reset_params(self, alpha, beta): + self._alpha = alpha + self._beta = beta + + # execute evaluation + def __call__(self, sentence, log=False): + """Evaluation function, gathering all the different scores + and return the final one. + + :param sentence: The input sentence for evaluation + :type sentence: str + :param log: Whether return the score in log representation. + :type log: bool + :return: Evaluation score, in the decimal or log. + :rtype: float + """ + lm = self._language_model_score(sentence) + word_cnt = self._word_count(sentence) + if log is False: + score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta) + else: + score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt) + return score diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/swig_wrapper.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/swig_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..e4eb43a50807bb602c7259a96adca47e60ee98ad --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/swig_wrapper.py @@ -0,0 +1,146 @@ +"""Wrapper for various CTC decoders in SWIG.""" +import paddlespeech_ctcdecoders + + +class Scorer(paddlespeech_ctcdecoders.Scorer): + """Wrapper for Scorer. 
+ + :param alpha: Parameter associated with language model. Don't use + language model when alpha = 0. + :type alpha: float + :param beta: Parameter associated with word count. Don't use word + count when beta = 0. + :type beta: float + :model_path: Path to load language model. + :type model_path: str + :param vocabulary: Vocabulary list. + :type vocabulary: list + """ + + def __init__(self, alpha, beta, model_path, vocabulary): + paddlespeech_ctcdecoders.Scorer.__init__(self, alpha, beta, model_path, + vocabulary) + + +def ctc_greedy_decoding(probs_seq, vocabulary, blank_id): + """Wrapper for ctc best path decodeing function in swig. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. + :rtype: str + """ + result = paddlespeech_ctcdecoders.ctc_greedy_decoding(probs_seq.tolist(), + vocabulary, blank_id) + return result + + +def ctc_beam_search_decoding(probs_seq, + vocabulary, + beam_size, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the CTC Beam Search Decoding function. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param beam_size: Width for beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. 
word count + or language model. + :type external_scoring_func: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding( + probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n, + ext_scoring_func, blank_id) + beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results] + return beam_results + + +def ctc_beam_search_decoding_batch(probs_split, + vocabulary, + beam_size, + num_processes, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + blank_id=0): + """Wrapper for the batched CTC beam search decodeing batch function. + + :param probs_seq: 3-D list with each element as an instance of 2-D list + of probabilities used by ctc_beam_search_decoder(). + :type probs_seq: 3-D list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param beam_size: Width for beam search. + :type beam_size: int + :param num_processes: Number of parallel processes. + :type num_processes: int + :param cutoff_prob: Cutoff probability in vocabulary pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + :type cutoff_top_n: int + :param num_processes: Number of parallel processes. + :type num_processes: int + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_function: callable + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. 
+ :rtype: list + """ + probs_split = [probs_seq.tolist() for probs_seq in probs_split] + + batch_beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding_batch( + probs_split, vocabulary, beam_size, num_processes, cutoff_prob, + cutoff_top_n, ext_scoring_func, blank_id) + batch_beam_results = [[(res[0], res[1]) for res in beam_results] + for beam_results in batch_beam_results] + return batch_beam_results + + +class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch): + """Wrapper for CtcBeamSearchDecoderBatch. + Args: + vocab_list (list): Vocabulary list. + beam_size (int): Width for beam search. + num_processes (int): Number of parallel processes. + param cutoff_prob (float): Cutoff probability in vocabulary pruning, + default 1.0, no pruning. + cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n + characters with highest probs in vocabulary will be + used in beam search, default 40. + param ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count + or language model. 
+ """ + + def __init__(self, vocab_list, batch_size, beam_size, num_processes, + cutoff_prob, cutoff_top_n, _ext_scorer, blank_id): + paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch.__init__( + self, vocab_list, batch_size, beam_size, num_processes, cutoff_prob, + cutoff_top_n, _ext_scorer, blank_id) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/test_decoders.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/test_decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..dc344b763de4b2b4e93e4c452ad40a4e79022f5e --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/ctcdecoder/test_decoders.py @@ -0,0 +1,87 @@ +"""Test decoders.""" +import unittest + +import decoders_deprecated as decoder + + +class TestDecoders(unittest.TestCase): + def setUp(self): + self.vocab_list = ["\'", ' ', 'a', 'b', 'c', 'd'] + self.beam_size = 20 + self.probs_seq1 = [[ + 0.06390443, 0.21124858, 0.27323887, 0.06870235, 0.0361254, + 0.18184413, 0.16493624 + ], [ + 0.03309247, 0.22866108, 0.24390638, 0.09699597, 0.31895462, + 0.0094893, 0.06890021 + ], [ + 0.218104, 0.19992557, 0.18245131, 0.08503348, 0.14903535, + 0.08424043, 0.08120984 + ], [ + 0.12094152, 0.19162472, 0.01473646, 0.28045061, 0.24246305, + 0.05206269, 0.09772094 + ], [ + 0.1333387, 0.00550838, 0.00301669, 0.21745861, 0.20803985, + 0.41317442, 0.01946335 + ], [ + 0.16468227, 0.1980699, 0.1906545, 0.18963251, 0.19860937, + 0.04377724, 0.01457421 + ]] + self.probs_seq2 = [[ + 0.08034842, 0.22671944, 0.05799633, 0.36814645, 0.11307441, + 0.04468023, 0.10903471 + ], [ + 0.09742457, 0.12959763, 0.09435383, 0.21889204, 0.15113123, + 0.10219457, 0.20640612 + ], [ + 0.45033529, 0.09091417, 0.15333208, 0.07939558, 0.08649316, + 0.12298585, 0.01654384 + ], [ + 0.02512238, 0.22079203, 0.19664364, 0.11906379, 0.07816055, + 0.22538587, 0.13483174 + ], [ + 0.17928453, 0.06065261, 0.41153005, 0.1172041, 0.11880313, + 0.07113197, 
0.04139363 + ], [ + 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, + 0.05294827, 0.22298418 + ]] + self.greedy_result = ["ac'bdc", "b'da"] + self.beam_search_result = ['acdc', "b'a"] + + def test_greedy_decoder_1(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq1, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[0]) + + def test_greedy_decoder_2(self): + bst_result = decoder.ctc_greedy_decoder(self.probs_seq2, + self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[1]) + + def test_beam_search_decoder_1(self): + beam_result = decoder.ctc_beam_search_decoder( + probs_seq=self.probs_seq1, + beam_size=self.beam_size, + vocabulary=self.vocab_list) + self.assertEqual(beam_result[0][1], self.beam_search_result[0]) + + def test_beam_search_decoder_2(self): + beam_result = decoder.ctc_beam_search_decoder( + probs_seq=self.probs_seq2, + beam_size=self.beam_size, + vocabulary=self.vocab_list) + self.assertEqual(beam_result[0][1], self.beam_search_result[1]) + + def test_beam_search_decoder_batch(self): + beam_results = decoder.ctc_beam_search_decoder_batch( + probs_split=[self.probs_seq1, self.probs_seq2], + beam_size=self.beam_size, + vocabulary=self.vocab_list, + num_processes=24) + self.assertEqual(beam_results[0][0][1], self.beam_search_result[0]) + self.assertEqual(beam_results[1][0][1], self.beam_search_result[1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/loss.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d082135a422f7edd567c203e8d7f9edfac0231e0 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/loss.py @@ -0,0 +1,166 @@ +import inspect + +import paddle +from paddle import nn +from paddle.nn import functional as F + + +__all__ = ['CTCLoss', "LabelSmoothingLoss"] + + +class CTCLoss(nn.Layer): + def __init__(self, + 
blank=0, + reduction='sum', + batch_average=False, + grad_norm_type=None): + super().__init__() + # last token id as blank id + self.loss = nn.CTCLoss(blank=blank, reduction=reduction) + self.batch_average = batch_average + + print(f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}") + print(f"CTCLoss Grad Norm Type: {grad_norm_type}") + + assert grad_norm_type in ('instance', 'batch', 'frame', None) + self.norm_by_times = False + self.norm_by_batchsize = False + self.norm_by_total_logits_len = False + if grad_norm_type is None: + # no grad norm + pass + elif grad_norm_type == 'instance': + self.norm_by_times = True + elif grad_norm_type == 'batch': + self.norm_by_batchsize = True + elif grad_norm_type == 'frame': + self.norm_by_total_logits_len = True + else: + raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}") + kwargs = { + "norm_by_times": self.norm_by_times, + "norm_by_batchsize": self.norm_by_batchsize, + "norm_by_total_logits_len": self.norm_by_total_logits_len, + } + + # Derive only the args which the func has + try: + param = inspect.signature(self.loss.forward).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + self._kwargs = {k: v for k, v in kwargs.items() if k in param} + _notin = {k: v for k, v in kwargs.items() if k not in param} + print(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") + + def forward(self, logits, ys_pad, hlens, ys_lens): + """Compute CTC loss. + + Args: + logits ([paddle.Tensor]): [B, Tmax, D] + ys_pad ([paddle.Tensor]): [B, Tmax] + hlens ([paddle.Tensor]): [B] + ys_lens ([paddle.Tensor]): [B] + + Returns: + [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}. 
+ """ + B = logits.shape[0] + # warp-ctc need logits, and do softmax on logits by itself + # warp-ctc need activation with shape [T, B, V + 1] + # logits: (B, L, D) -> (L, B, D) + logits = logits.transpose([1, 0, 2]) + ys_pad = ys_pad.astype(paddle.int32) + loss = self.loss(logits, ys_pad, hlens, ys_lens, **self._kwargs) + if self.batch_average: + # Batch-size average + loss = loss / B + return loss + + +class LabelSmoothingLoss(nn.Layer): + """Label-smoothing loss. + In a standard CE loss, the label's data distribution is: + [0,1,2] -> + [ + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + In the smoothing version CE Loss,some probabilities + are taken from the true label prob (1.0) and are divided + among other labels. + e.g. + smoothing=0.1 + [0,1,2] -> + [ + [0.9, 0.05, 0.05], + [0.05, 0.9, 0.05], + [0.05, 0.05, 0.9], + ] + + """ + + def __init__(self, + size: int, + padding_idx: int, + smoothing: float, + normalize_length: bool=False): + """Label-smoothing loss. + + Args: + size (int): the number of class + padding_idx (int): padding class id which will be ignored for loss + smoothing (float): smoothing rate (0.0 means the conventional CE) + normalize_length (bool): + True, normalize loss by sequence length; + False, normalize loss by batch size. + Defaults to False. + """ + super().__init__() + self.size = size + self.padding_idx = padding_idx + self.smoothing = smoothing + self.confidence = 1.0 - smoothing + self.normalize_length = normalize_length + self.criterion = nn.KLDivLoss(reduction="none") + + def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor: + """Compute loss between x and target. + The model outputs and data labels tensors are flatten to + (batch*seqlen, class) shape and a mask is applied to the + padding part which should not be calculated for loss. 
+ + Args: + x (paddle.Tensor): prediction (batch, seqlen, class) + target (paddle.Tensor): + target signal masked with self.padding_id (batch, seqlen) + Returns: + loss (paddle.Tensor) : The KL loss, scalar float value + """ + B, T, D = x.shape + assert D == self.size + x = x.reshape((-1, self.size)) + target = target.reshape([-1]) + + # use zeros_like instead of torch.no_grad() for true_dist, + # since no_grad() can not be exported by JIT + true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) + ignore = target == self.padding_idx # (B,) + + #TODO(Hui Zhang): target = target * (1 - ignore) # avoid -1 index + target = target.masked_fill(ignore, 0) # avoid -1 index + # true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + target_mask = F.one_hot(target, self.size) + true_dist *= (1 - target_mask) + true_dist += target_mask * self.confidence + + kl = self.criterion(F.log_softmax(x, axis=1), true_dist) + + #TODO(Hui Zhang): sum not support bool type + #total = len(target) - int(ignore.sum()) + total = len(target) - int(ignore.type_as(target).sum()) + denom = total if self.normalize_length else B + #numer = (kl * (1 - ignore)).sum() + numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum() + return numer / denom diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/text_grid.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/text_grid.py new file mode 100644 index 0000000000000000000000000000000000000000..4865249c3a235a5f5fe3b8a73c308c96d2d69415 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/text_grid.py @@ -0,0 +1,114 @@ +from typing import Dict +from typing import List +from typing import Text + +import textgrid + + +def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]: + """segment ctc alignment ids by continuous blank and repeat label. + + Args: + alignment (List[int]): ctc alignment id sequence. + e.g. 
[0, 0, 0, 1, 1, 1, 2, 0, 0, 3] + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + List[List[int]]: token align, segment aligment id sequence. + e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] + """ + # convert alignment to a praat format, which is a doing phonetics + # by computer and helps analyzing alignment + align_segs = [] + # get frames level duration for each token + start = 0 + end = 0 + while end < len(alignment): + while end < len(alignment) and alignment[end] == blank_id: # blank + end += 1 + if end == len(alignment): + align_segs[-1].extend(alignment[start:]) + break + end += 1 + while end < len(alignment) and alignment[end - 1] == alignment[ + end]: # repeat label + end += 1 + align_segs.append(alignment[start:end]) + start = end + return align_segs + + +def align_to_tierformat(align_segs: List[List[int]], + subsample: int, + token_dict: Dict[int, Text], + blank_id=0) -> List[Text]: + """Generate textgrid.Interval format from alignment segmentations. + + Args: + align_segs (List[List[int]]): segmented ctc alignment ids. + subsample (int): 25ms frame_length, 10ms hop_length, 1/subsample + token_dict (Dict[int, Text]): int -> str map. + + Returns: + List[Text]: list of textgrid.Interval text, str(start, end, text). 
+ """ + hop_length = 10 # ms + second_ms = 1000 # ms + frame_per_second = second_ms / hop_length # 25ms frame_length, 10ms hop_length + second_per_frame = 1.0 / frame_per_second + + begin = 0 + duration = 0 + tierformat = [] + + for idx, tokens in enumerate(align_segs): + token_len = len(tokens) + token = tokens[-1] + # time duration in second + duration = token_len * subsample * second_per_frame + if idx < len(align_segs) - 1: + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + else: + for i in tokens: + if i != blank_id: + token = i + break + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + begin = begin + duration + + return tierformat + + +def generate_textgrid(maxtime: float, + intervals: List[Text], + output: Text, + name: Text='ali') -> None: + """Create alignment textgrid file. + + Args: + maxtime (float): audio duartion. + intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item. + output (Text): textgrid filepath. + name (Text, optional): tier or layer name. Defaults to 'ali'. 
+ """ + # Download Praat: https://www.fon.hum.uva.nl/praat/ + avg_interval = maxtime / (len(intervals) + 1) + print(f"average second/token: {avg_interval}") + margin = 0.0001 + + tg = textgrid.TextGrid(maxTime=maxtime) + tier = textgrid.IntervalTier(name=name, maxTime=maxtime) + + i = 0 + for dur in intervals: + s, e, text = dur.split() + tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text) + + tg.append(tier) + + tg.write(output) + print("successfully generator textgrid {}.".format(output)) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/utility.py b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..29e758733fe7de7a394b43f964e6e017a5c57151 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/decoder/utils/utility.py @@ -0,0 +1,79 @@ +"""Contains common utility functions.""" +import math +import os +import random +import sys +from contextlib import contextmanager +from pprint import pformat +from typing import List + +import distutils.util +import numpy as np +import paddle +import soundfile + + +__all__ = ["all_version", "UpdateConfig", "seed_all", "log_add"] + + +def all_version(): + vers = { + "python": sys.version, + "paddle": paddle.__version__, + "paddle_commit": paddle.version.commit, + "soundfile": soundfile.__version__, + } + print(f"Deps Module Version:{pformat(list(vers.items()))}") + + +@contextmanager +def UpdateConfig(config): + """Update yacs config""" + config.defrost() + yield + config.freeze() + + +def seed_all(seed: int=20210329): + """freeze random generator seed.""" + np.random.seed(seed) + random.seed(seed) + paddle.seed(seed) + + +def log_add(args: List[int]) -> float: + """Stable log add + + Args: + args (List[int]): log scores + + Returns: + float: sum of log scores + """ + if all(a == -float('inf') for a in args): + return -float('inf') + a_max = max(args) + lsp = math.log(sum(math.exp(a - 
a_max) for a in args)) + return a_max + lsp + + +def get_subsample(config): + """Subsample rate from config. + + Args: + config (yacs.config.CfgNode): yaml config + + Returns: + int: subsample rate. + """ + if config['encoder'] == 'squeezeformer': + return 4 + else: + input_layer = config["encoder_conf"]["input_layer"] + assert input_layer in ["conv2d", "conv2d6", "conv2d8"] + if input_layer == "conv2d": + return 4 + elif input_layer == "conv2d6": + return 6 + elif input_layer == "conv2d8": + return 8 diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/inference.py b/models/speech/speech_recognition/deepspeech2/ixrt/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4e4ddbad28b7d0f3297ae230c5f5e9f307e017 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/inference.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import time +import glob +import json +import random +import argparse +import numpy as np +from tqdm import tqdm + +import torch +import paddle +import tensorrt +from tensorrt import Dims +from cuda import cuda, cudart + +from transform import Transformation +from dataset import LibriSpeech +from decoder import CTCDecoder + +from utils import VOCABLIST as vocab_list +from utils.error_rate import wer +from utils import deepspeech2_trtapi_ixrt, setup_io_bindings +from load_ixrt_plugin import load_ixrt_plugin + +load_ixrt_plugin() + +def parse_config(): + parser = argparse.ArgumentParser(description="The DeepSpeech2 network Inference on LibriSpeech dataset.") + parser.add_argument( + "--model_type", + type=str, + default="DeepSpeech2", + help="The speech recognition model(DeepSpeech2)", + ) + parser.add_argument( + "--preprocess_config", + type=str, + default="data/preprocess.yaml", + help="The preprocess input file", + ) + parser.add_argument( + "--engine_file", + type=str, + default="../../../../../data/checkpoints/deepspeech2/deepspeech2.engine", + 
help="engine file path" + ) + parser.add_argument( + "--decoder_file", + type=str, + default="../../../../../data/checkpoints/deepspeech2/decoder.pdparams", + help="ctcdecoder checkpoints file" + ) + parser.add_argument( + "--lang_model_path", + type=str, + default="../../../../../data/checkpoints/deepspeech2/lm/common_crawl_00.prune01111.trie.klm", + help="The language model path" + ) + # dataset + parser.add_argument( + '--dataroot', + default="../../../../../data/datasets/LibriSpeech", + help='location to download dataset(s)' + ) + parser.add_argument("--bsz", type=int, default=1, help="Dynamic input") + parser.add_argument("--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4") + parser.add_argument("--use_async", action="store_true") + parser.add_argument("--wer_target", type=float, default=-1.0) + parser.add_argument("--test_num_samples", type=int, default=-1) + + config = parser.parse_args() + return config + + +def test_result(data, engine, context, decoder, test_num_samples): + + input_name = "input" + output_name = "output" + + data_len = len(data) + wer_sum = 0.0 + + if test_num_samples != -1: + data_len = test_num_samples + + for i in tqdm(range(data_len), desc="Testing WER"): + + start_time = time.time() + audio, text = data[i] + audio_shape = audio.shape + # print(f"audio_shape: {audio_shape}") + + # Set the input shape + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(audio_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], audio, audio.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + if config.use_async: + stream = cuda.Stream() + context.execute_async_v2(allocations, stream.handle) + stream.synchronize() + else: + context.execute_v2(allocations) + + err, = cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + 
assert(err == cuda.CUresult.CUDA_SUCCESS) + + eouts = paddle.to_tensor(pred_output) + eouts_len = paddle.to_tensor([eouts.shape[1]]) + probs = decoder.softmax(eouts) + batch_size = probs.shape[0] + + decoder.init_decoder( + batch_size, + vocab_list, + "ctc_beam_search", + config.lang_model_path, + 1.9, + 0.3, + 500, + 1.0, + 40, + 8 + ) + decoder.reset_decoder(batch_size=batch_size) + decoder.next(probs, eouts_len) + trans_best, trans_beam = decoder.decode() + # print(f"result_transcripts: {trans_best}") + # print(f"text: {text}") + cur_wer = wer(text, trans_best[0], True) + print(f"wer: {cur_wer}") + wer_sum += cur_wer + + wer_avg = wer_sum / data_len + print(f"wer_avg: {wer_avg}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["wer_avg"] = round(wer_avg, 3) + print(metricResult) + return wer_avg + + +def main(config): + + # Step1:build dataset + preprocessing = Transformation(config.preprocess_config) + dataset = LibriSpeech(config.dataroot, preprocessing) + + # Step2: load engine + engine, context = deepspeech2_trtapi_ixrt(config.engine_file) + + # Step3: load decoder + decoder = CTCDecoder( + odim=31, + enc_n_units=2048, + blank_id=0, + dropout_rate=0.0, + reduction=True, + batch_average=True, + grad_norm_type=None + ) + decoder_state_dict = paddle.load(config.decoder_file) + decoder.set_state_dict(decoder_state_dict) + + # Step4: run test + wer = test_result(dataset, engine, context, decoder, config.test_num_samples) + status = 'Pass' if wer <= config.wer_target else 'Fail' + + print("="*30) + print(f"\nCheck AUC: Test : {wer} Target:{config.wer_target} State : {status}") + print("="*30) + + +if __name__ == "__main__": + config = parse_config() + main(config) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/inference_demo.py b/models/speech/speech_recognition/deepspeech2/ixrt/inference_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..3385eadcfeb4466407897a6d4cf94e1258888e49 --- /dev/null +++ 
b/models/speech/speech_recognition/deepspeech2/ixrt/inference_demo.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import time +import glob +import json +import random +import argparse +import soundfile +import numpy as np +from tqdm import tqdm + +import torch +import paddle +import tensorrt +from tensorrt import Dims +from cuda import cuda, cudart + +from transform import Transformation +from decoder import CTCDecoder + +from utils import VOCABLIST as vocab_list +from utils import deepspeech2_trtapi_ixrt, setup_io_bindings + +from load_ixrt_plugin import load_ixrt_plugin + +load_ixrt_plugin() + +def parse_config(): + parser = argparse.ArgumentParser(description="The DeepSpeech2 network Inference demo and performance.") + parser.add_argument( + "--model_type", + type=str, + default="DeepSpeech2", + help="The speech recognition model(DeepSpeech2)", + ) + parser.add_argument( + "--audio_file", + type=str, + default="data/demo_002_en.wav", + help="The input speech wave", + ) + parser.add_argument( + "--preprocess_config", + type=str, + default="data/preprocess.yaml", + help="The preprocess input file", + ) + parser.add_argument( + "--engine_file", + type=str, + default="../../../../../data/checkpoints/deepspeech2/deepspeech2.engine", + help="engine file path" + ) + parser.add_argument( + "--decoder_file", + type=str, + default="../../../../../data/checkpoints/deepspeech2/decoder.pdparams", + help="ctcdecoder checkpoints file" + ) + parser.add_argument( + "--lang_model_path", + type=str, + default="../../../../../data/checkpoints/deepspeech2/lm/common_crawl_00.prune01111.trie.klm", + help="The language model path" + ) + parser.add_argument("--bsz", type=int, default=1, help="Dynamic input") + parser.add_argument("--device", type=int, default=0, help="cuda device, i.e. 
0 or 0,1,2,3,4") + parser.add_argument("--use_async", action="store_true") + parser.add_argument("--run_loop", type=int, default=-1) + parser.add_argument("--warm_up", type=int, default=-1) + parser.add_argument("--throughput_target", type=float, default=-1.0) + + config = parser.parse_args() + return config + + +def main(config): + # Step1: Load the input wave + assert os.path.isfile(config.audio_file), "The input audio file must be existed!" + audio, sample_rate = soundfile.read(config.audio_file, dtype="int16", always_2d=True) + audio = audio[:, 0] + print(f"audio shape: {audio.shape}") + + # fbank + preprocess_args = {"train": False} + preprocessing = Transformation(config.preprocess_config) + input_data = preprocessing(audio, **preprocess_args) + input_data = np.expand_dims(input_data.astype(np.float32), axis=0) + print(f"feat shape: {input_data.shape}") + + # Step2: Load the engine + engine, context = deepspeech2_trtapi_ixrt(config.engine_file) + + input_shape = input_data.shape + print("input shape: ", input_shape) + + input_idx = engine.get_binding_index("input") + context.set_binding_shape(input_idx, Dims(input_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], input_data, input_data.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + print("\n Warm Up Start.") + for i in range(config.warm_up): context.execute_v2(allocations) + print("Warm Up Done.") + + run_times = [] + for i in range(config.run_loop): + start_time = time.time() + context.execute_v2(allocations) + end_time = time.time() + run_times.append(end_time - start_time) + + run_times.remove(max(run_times)) + run_times.remove(min(run_times)) + + avg_time = sum(run_times) / len(run_times) + throughput = pred_output.shape[1] / avg_time + print(f"Executing {config.run_loop} done, Time: {avg_time}, ThroughPut: {throughput}") + + err, = 
cuda.cuMemcpyDtoH(pred_output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + # Step3: Load the CTCDecoder + decoder = CTCDecoder( + odim=31, + enc_n_units=2048, + blank_id=0, + dropout_rate=0.0, + reduction=True, + batch_average=True, + grad_norm_type=None + ) + decoder_state_dict = paddle.load(config.decoder_file) + decoder.set_state_dict(decoder_state_dict) + + eouts = paddle.to_tensor(pred_output) + eouts_len = paddle.to_tensor([eouts.shape[1]]) + probs = decoder.softmax(eouts) + batch_size = probs.shape[0] + + decoder.init_decoder( + batch_size, + vocab_list, + "ctc_beam_search", + config.lang_model_path, + 1.9, + 0.3, + 500, + 1.0, + 40, + 8 + ) + + decoder.reset_decoder(batch_size=batch_size) + decoder.next(probs, eouts_len) + trans_best, trans_beam = decoder.decode() + print("result_transcripts: ", trans_best) + + status = 'Pass' if throughput >= config.throughput_target else 'Fail' + + print("="*30) + print(f"\nCheck ThroughPut: Test : {throughput} Target:{config.throughput_target} State : {status}") + print("="*30) + + metricResult = {"metricResult": {}} + metricResult["metricResult"]["ThroughPut"] = round(throughput, 3) + print(metricResult) + + +if __name__ == "__main__": + config = parse_config() + main(config) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/load_ixrt_plugin.py b/models/speech/speech_recognition/deepspeech2/ixrt/load_ixrt_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..b40f69103ed16c1a2ec127fd5b9344f4b079fdce --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/load_ixrt_plugin.py @@ -0,0 +1,13 @@ +from os.path import join, dirname, exists +import tensorrt as trt +import ctypes + +def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + 
f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + trt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/modify_model_to_dynamic.py b/models/speech/speech_recognition/deepspeech2/ixrt/modify_model_to_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..b168d3c48100442eadf7054ce85e21163ea56f64 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/modify_model_to_dynamic.py @@ -0,0 +1,28 @@ +import onnx +from onnx import helper +import argparse + +def modify_to_dynamic(args): + model = onnx.load(args.static_onnx) + + graph = model.graph + for input_node in graph.input: + if input_node.name == 'input': + input_shape = input_node.type.tensor_type.shape.dim + input_shape[0].dim_value = 1 + input_shape[1].dim_param = 'None' + input_shape[2].dim_value = 161 + + onnx.save(model, args.dynamic_onnx) + onnx.checker.check_model(model, full_check=True) + +def parse_args(): + parser = argparse.ArgumentParser(description="modify static shape to dynamic for deepspeech2") + parser.add_argument("--static_onnx", type=str, required=True, help="The input static onnx path") + parser.add_argument("--dynamic_onnx", type=str, required=True, help="The output dynamic onnx path") + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + modify_to_dynamic(args) \ No newline at end of file diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/scripts/infer_deepspeech2_fp16_accuracy.sh b/models/speech/speech_recognition/deepspeech2/ixrt/scripts/infer_deepspeech2_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..1d10f6f3f4b10729dcf15d85e3bf3d07ec7c475d --- /dev/null +++ 
#!/bin/bash
# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Run the DeepSpeech2 FP16 accuracy evaluation with ixRT.
# Optional flag: --tgt <wer_target> (defaults to -1, i.e. no target check).

DATASETS_DIR="${DATASETS_DIR:-/path/to/LibriSpeech}"
CHECKPOINTS_DIR="${CHECKPOINTS_DIR:-./checkpoints}"
RUN_DIR="${RUN_DIR:-.}"

# Pick up the value following --tgt, if present.
TGT=-1
cli_args=("$@")
for i in "${!cli_args[@]}"
do
    if [ "${cli_args[$i]}" = "--tgt" ]; then
        TGT="${cli_args[$((i + 1))]}"
    fi
done

cd "${RUN_DIR}"
python3 inference.py \
    --model_type "deepspeech2" \
    --engine_file "${CHECKPOINTS_DIR}/deepspeech2.engine" \
    --decoder_file "data/decoder.pdparams" \
    --lang_model_path "${CHECKPOINTS_DIR}/common_crawl_00.prune01111.trie.klm" \
    --dataroot "${DATASETS_DIR}" \
    --wer_target ${TGT} \
    --test_num_samples 500
#!/bin/bash
# Copyright (c) 2026, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Run the DeepSpeech2 FP16 performance (throughput) benchmark with ixRT.
# Optional flag: --tgt <throughput_target> (defaults to -1, no target check).

DATASETS_DIR="${DATASETS_DIR:-/path/to/LibriSpeech}"
CHECKPOINTS_DIR="${CHECKPOINTS_DIR:-./checkpoints}"
RUN_DIR="${RUN_DIR:-.}"

# Pick up the value following --tgt, if present.
TGT=-1
cli_args=("$@")
for i in "${!cli_args[@]}"
do
    if [ "${cli_args[$i]}" = "--tgt" ]; then
        TGT="${cli_args[$((i + 1))]}"
    fi
done

cd "${RUN_DIR}"
python3 inference_demo.py \
    --model_type "deepspeech2" \
    --engine_file "${CHECKPOINTS_DIR}/deepspeech2.engine" \
    --decoder_file "data/decoder.pdparams" \
    --lang_model_path "${CHECKPOINTS_DIR}/common_crawl_00.prune01111.trie.klm" \
    --run_loop 12 \
    --warm_up 5 \
    --throughput_target ${TGT}
class CMVN():
    """Apply global/speaker CMVN (cepstral mean/variance normalization) or
    its inverse.

    Statistics can be supplied directly as a dict mapping speaker -> stats
    matrix, or loaded from disk as a Kaldi matrix ("mat"), a numpy file
    ("npy"), a Kaldi ark ("ark") or an HDF5 file ("hdf5").

    Kaldi stores CMVN statistics as a (2, feat_dim + 1) matrix: row 0 holds
    the per-dimension feature sums with the frame count in the last column,
    row 1 holds the per-dimension squared sums.

    Args:
        stats: statistics source (dict, or a path interpreted per filetype).
        norm_means (bool): subtract the mean when applying CMVN.
        norm_vars (bool): scale by the inverse std when applying CMVN.
        filetype (str): one of "mat", "npy", "ark", "hdf5".
        utt2spk (str): optional utterance->speaker map file for speaker CMVN.
        spk2utt (str): optional speaker->utterances map file (alternative).
        reverse (bool): apply the inverse transform instead.
        std_floor (float): lower bound on std to avoid division blow-up.
    """

    def __init__(
            self,
            stats,
            norm_means=True,
            norm_vars=False,
            filetype="mat",
            utt2spk=None,
            spk2utt=None,
            reverse=False,
            std_floor=1.0e-20, ):
        self.stats_file = stats
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.reverse = reverse
        # FIX: always define this attribute. It used to be created only in
        # the "ark"/"hdf5" branches, so probing it for other filetypes
        # raised AttributeError.
        self.accept_uttid = False

        if isinstance(stats, dict):
            stats_dict = dict(stats)
        else:
            # Use for global CMVN
            if filetype == "mat":
                stats_dict = {None: kaldiio.load_mat(stats)}
            # Use for global CMVN
            elif filetype == "npy":
                stats_dict = {None: np.load(stats)}
            # Use for speaker CMVN
            elif filetype == "ark":
                self.accept_uttid = True
                stats_dict = dict(kaldiio.load_ark(stats))
            # Use for speaker CMVN
            elif filetype == "hdf5":
                self.accept_uttid = True
                # open read-only; the statistics are never written here
                stats_dict = h5py.File(stats, "r")
            else:
                raise ValueError("Not supporting filetype={}".format(filetype))

        if utt2spk is not None:
            self.utt2spk = {}
            with io.open(utt2spk, "r", encoding="utf-8") as f:
                for line in f:
                    utt, spk = line.rstrip().split(None, 1)
                    self.utt2spk[utt] = spk
        elif spk2utt is not None:
            self.utt2spk = {}
            with io.open(spk2utt, "r", encoding="utf-8") as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        self.utt2spk[utt] = spk
        else:
            self.utt2spk = None

        # Convert the raw sums into a per-speaker bias (-mean) and
        # scale (1/std).
        self.bias = {}
        self.scale = {}
        for spk, stats in stats_dict.items():
            assert len(stats) == 2, stats.shape

            count = stats[0, -1]

            # If the count itself is an array, only its first element is used.
            if not (np.isscalar(count) or isinstance(count, (int, float))):
                count = count.flatten()[0]

            mean = stats[0, :-1] / count
            # V(x) = E(x^2) - (E(x))^2
            var = stats[1, :-1] / count - mean * mean
            std = np.maximum(np.sqrt(var), std_floor)
            self.bias[spk] = -mean
            self.scale[spk] = 1 / std

    def __repr__(self):
        return ("{name}(stats_file={stats_file}, "
                "norm_means={norm_means}, norm_vars={norm_vars}, "
                "reverse={reverse})".format(
                    name=self.__class__.__name__,
                    stats_file=self.stats_file,
                    norm_means=self.norm_means,
                    norm_vars=self.norm_vars,
                    reverse=self.reverse, ))

    def __call__(self, x, uttid=None):
        """Normalize (or de-normalize, if reverse=True) features x.

        Args:
            x (np.ndarray): features, shape (time, feat_dim).
            uttid: utterance id, mapped to a speaker when utt2spk is set.
        Returns:
            np.ndarray: transformed features, same shape as x.
        """
        if self.utt2spk is not None:
            spk = self.utt2spk[uttid]
        else:
            spk = uttid

        if not self.reverse:
            # apply cmvn
            if self.norm_means:
                x = np.add(x, self.bias[spk])
            if self.norm_vars:
                x = np.multiply(x, self.scale[spk])
        else:
            # apply reverse cmvn (undo scaling first, then the mean shift)
            if self.norm_vars:
                x = np.divide(x, self.scale[spk])
            if self.norm_means:
                x = np.subtract(x, self.bias[spk])

        return x
+ "Apply Global CMVN" + + def __init__(self, + cmvn_path, + norm_means=True, + norm_vars=True, + std_floor=1.0e-20): + # cmvn_path: Option[str, dict] + cmvn = cmvn_path + self.cmvn = cmvn + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + if isinstance(cmvn, dict): + cmvn_stats = cmvn + else: + with open(cmvn) as f: + cmvn_stats = json.load(f) + self.count = cmvn_stats['frame_num'] + self.mean = np.array(cmvn_stats['mean_stat']) / self.count + self.square_sums = np.array(cmvn_stats['var_stat']) + self.var = self.square_sums / self.count - self.mean**2 + self.std = np.maximum(np.sqrt(self.var), self.std_floor) + + def __repr__(self): + return f"""{self.__class__.__name__}( + cmvn_path={self.cmvn}, + norm_means={self.norm_means}, + norm_vars={self.norm_vars},)""" + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + if self.norm_means: + x = np.subtract(x, self.mean) + + if self.norm_vars: + x = np.divide(x, self.std) + return x diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/transform/functional.py b/models/speech/speech_recognition/deepspeech2/ixrt/transform/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..688a0bede78787479a4d05b75ee80bf34bfb6e7d --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/transform/functional.py @@ -0,0 +1,110 @@ +import inspect + + +def check_kwargs(func, kwargs, name=None): + """check kwargs are valid for func + + If kwargs are invalid, raise TypeError as same as python default + :param function func: function to be validated + :param dict kwargs: keyword arguments for func + :param str name: name used in TypeError (default is func name) + """ + try: + params = inspect.signature(func).parameters + except ValueError: + return + if name is None: + name = func.__name__ + for k in kwargs.keys(): + if k not in params: + raise TypeError( + f"{name}() got an unexpected keyword argument '{k}'") + + +class TransformInterface: + 
"""Transform Interface""" + + def __call__(self, x): + raise NotImplementedError("__call__ method is not implemented") + + @classmethod + def add_arguments(cls, parser): + return parser + + def __repr__(self): + return self.__class__.__name__ + "()" + + +class Identity(TransformInterface): + """Identity Function""" + + def __call__(self, x): + return x + + +class FuncTrans(TransformInterface): + """Functional Transformation + + WARNING: + Builtin or C/C++ functions may not work properly + because this class heavily depends on the `inspect` module. + + Usage: + + >>> def foo_bar(x, a=1, b=2): + ... '''Foo bar + ... :param x: input + ... :param int a: default 1 + ... :param int b: default 2 + ... ''' + ... return x + a - b + + + >>> class FooBar(FuncTrans): + ... _func = foo_bar + ... __doc__ = foo_bar.__doc__ + """ + + _func = None + + def __init__(self, **kwargs): + self.kwargs = kwargs + check_kwargs(self.func, kwargs) + + def __call__(self, x): + return self.func(x, **self.kwargs) + + @classmethod + def add_arguments(cls, parser): + fname = cls._func.__name__.replace("_", "-") + group = parser.add_argument_group(fname + " transformation setting") + for k, v in cls.default_params().items(): + # TODO(karita): get help and choices from docstring? 
+ attr = k.replace("_", "-") + group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) + return parser + + @property + def func(self): + return type(self)._func + + @classmethod + def default_params(cls): + try: + d = dict(inspect.signature(cls._func).parameters) + except ValueError: + d = dict() + return { + k: v.default + for k, v in d.items() if v.default != inspect.Parameter.empty + } + + def __repr__(self): + params = self.default_params() + params.update(**self.kwargs) + ret = self.__class__.__name__ + "(" + if len(params) == 0: + return ret + ")" + for k, v in params.items(): + ret += "{}={}, ".format(k, v) + return ret[:-2] + ")" diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/transform/spec_augment.py b/models/speech/speech_recognition/deepspeech2/ixrt/transform/spec_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..e83efa12ec5fe38034c1f964042ae5f457af1077 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/transform/spec_augment.py @@ -0,0 +1,193 @@ +"""Spec Augment module for preprocessing i.e., data augmentation""" +import random + +import numpy +from PIL import Image + +from .functional import FuncTrans + + +def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): + """time warp for spec augment + + move random center frame by the random width ~ uniform(-window, window) + :param numpy.ndarray x: spectrogram (time, freq) + :param int max_time_warp: maximum time frames to warp + :param bool inplace: overwrite x with the result + :param str mode: "PIL" (default, fast, not differentiable) or "sparse_image_warp" + (slow, differentiable) + :returns numpy.ndarray: time warped spectrogram (time, freq) + """ + window = max_time_warp + if window == 0: + return x + + if mode == "PIL": + t = x.shape[0] + if t - window <= window: + return x + # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1 + center = random.randrange(window, t - window) + warped = random.randrange(center - window, 
def freq_mask(x, F=30, n_mask=2, replace_with_zero=True, inplace=False):
    """Frequency masking for SpecAugment.

    :param numpy.ndarray x: spectrogram (time, freq)
    :param int F: upper bound (exclusive) for each random mask draw
    :param int n_mask: the number of masks
    :param bool replace_with_zero: pad zero on mask if true else use mean
    :param bool inplace: overwrite x instead of copying
    :returns numpy.ndarray: masked spectrogram (time, freq)
    """
    cloned = x if inplace else x.copy()

    num_mel_channels = cloned.shape[1]
    # each row: (start-range limiter f, raw mask width mask_end)
    fs = numpy.random.randint(0, F, size=(n_mask, 2))

    for f, mask_end in fs:
        # FIX: skip draws at least as wide as the feature axis; randrange
        # over an empty range would raise ValueError. This mirrors the
        # guard already present in time_mask.
        if num_mel_channels - f <= 0:
            continue
        f_zero = random.randrange(0, num_mel_channels - f)
        mask_end += f_zero

        # zero-width mask: nothing to do
        if f == 0:
            continue

        if replace_with_zero:
            cloned[:, f_zero:mask_end] = 0
        else:
            cloned[:, f_zero:mask_end] = cloned.mean()
    return cloned
def spec_augment(
        x,
        resize_mode="PIL",
        max_time_warp=80,
        max_freq_width=27,
        n_freq_mask=2,
        max_time_width=100,
        n_time_mask=2,
        inplace=True,
        replace_with_zero=True, ):
    """SpecAugment: random time warping followed by time/freq masking.

    The default setting is based on LD (Librispeech double) in Table 2 of
    https://arxiv.org/pdf/1904.08779.pdf

    :param numpy.ndarray x: (time, freq)
    :param str resize_mode: "PIL" (fast, nondifferentiable) or
        "sparse_image_warp" (slow, differentiable)
    :param int max_time_warp: maximum frames to warp the center frame (W)
    :param int max_freq_width: maximum width of the random freq mask (F)
    :param int n_freq_mask: the number of the random freq mask (m_F)
    :param int max_time_width: maximum width of the random time mask (T)
    :param int n_time_mask: the number of the random time mask (m_T)
    :param bool inplace: overwrite intermediate array
    :param bool replace_with_zero: pad zero on mask if true else use mean
    """
    assert isinstance(x, numpy.ndarray)
    assert x.ndim == 2
    warped = time_warp(x, max_time_warp, inplace=inplace, mode=resize_mode)
    masked = freq_mask(
        warped,
        max_freq_width,
        n_freq_mask,
        inplace=inplace,
        replace_with_zero=replace_with_zero, )
    return time_mask(
        masked,
        max_time_width,
        n_time_mask,
        inplace=inplace,
        replace_with_zero=replace_with_zero, )
def stft(x,
         n_fft,
         n_shift,
         win_length=None,
         window="hann",
         center=True,
         pad_mode="reflect"):
    """Short-time Fourier transform of a mono or multi-channel signal.

    librosa.stft itself is mono-only, so each channel is transformed
    separately and the results stacked.

    :param numpy.ndarray x: signal of shape [Time] or [Time, Channel]
    :returns numpy.ndarray: [Time, Freq] for mono input,
        [Time, Channel, Freq] otherwise
    """
    mono = (x.ndim == 1)
    if mono:
        # x: [Time] -> [Time, Channel]
        x = x[:, None]
    x = x.astype(np.float32)

    per_channel = []
    for ch in range(x.shape[1]):
        spec = librosa.stft(
            y=x[:, ch],
            n_fft=n_fft,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode, ).T  # transpose -> [Time, Freq]
        per_channel.append(spec)
    # x: [Time, Channel, Freq]
    out = np.stack(per_channel, axis=1)

    # mono input: drop the channel axis again
    return out[:, 0] if mono else out
class Spectrogram():
    """Callable wrapper around :func:`spectrogram` with fixed STFT settings."""

    def __init__(self, n_fft, n_shift, win_length=None, window="hann"):
        self.n_fft = n_fft
        self.n_shift = n_shift
        self.win_length = win_length
        self.window = window

    def __repr__(self):
        return (f"{self.__class__.__name__}(n_fft={self.n_fft}, "
                f"n_shift={self.n_shift}, win_length={self.win_length}, "
                f"window={self.window})")

    def __call__(self, x):
        # delegate to the module-level helper with the stored settings
        return spectrogram(
            x,
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
            window=self.window, )
("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) + + def __call__(self, x): + return stft2logmelspectrogram( + x, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + fmin=self.fmin, + fmax=self.fmax, ) + + +class Stft(): + def __init__( + self, + n_fft, + n_shift, + win_length=None, + window="hann", + center=True, + pad_mode="reflect", ): + self.n_fft = n_fft + self.n_shift = n_shift + self.win_length = win_length + self.window = window + self.center = center + self.pad_mode = pad_mode + + def __repr__(self): + return ("{name}(n_fft={n_fft}, n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center}, pad_mode={pad_mode})".format( + name=self.__class__.__name__, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, )) + + def __call__(self, x): + return stft( + x, + self.n_fft, + self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, ) + + +class IStft(): + def __init__(self, n_shift, win_length=None, window="hann", center=True): + self.n_shift = n_shift + self.win_length = win_length + self.window = window + self.center = center + + def __repr__(self): + return ("{name}(n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center})".format( + name=self.__class__.__name__, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, )) + + def __call__(self, x): + return istft( + x, + self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + 
energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + + self.fs = fs + self.n_mels = n_mels + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + + + # torchaudio + """ + waveform = torch.from_numpy(np.expand_dims(x, 0)).type(torch.float32) + mat = kaldi.fbank( + waveform, + num_mel_bins=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sample_frequency=self.fs) + """ + # paddlespeech + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class WavProcess(): + def __init__(self): + """ + Args: + dither (float): Dithering constant + + Returns: + """ + + def __call__(self, x): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
class LogMelSpectrogramKaldi_decay():
    """Kaldi-style log filterbank features via python_speech_features.logfbank.

    Args:
        fs (int): sample rate
        n_mels (int): number of mel filter banks
        n_fft (int): FFT size
        n_shift (int): frame shift in samples (10 ms at 16 kHz)
        win_length (int): frame length in samples (25 ms at 16 kHz)
        window (str): window type passed to logfbank
        fmin (int): lowest mel filter frequency
        fmax (int|None): highest mel filter frequency; defaults to fs / 2
        eps (float): numerical floor
        dither (float): dithering constant (train mode only)
    """

    def __init__(
            self,
            fs=16000,
            n_mels=80,
            n_fft=512,  # fft point
            n_shift=160,  # unit:sample, 10ms
            win_length=400,  # unit:sample, 25ms
            window="povey",
            fmin=20,
            fmax=None,
            eps=1e-10,
            dither=1.0):
        self.fs = fs
        self.n_mels = n_mels
        self.n_fft = n_fft
        if n_shift > win_length:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # logfbank expects winlen/winstep in seconds (the old comments said
        # ms, but 160 / 16000 == 0.01 s).
        self.n_shift = n_shift / fs  # unit: s
        self.win_length = win_length / fs  # unit: s

        self.window = window
        self.fmin = fmin
        if fmax is None:
            fmax_ = self.fs / 2
        elif fmax > int(self.fs / 2):
            raise ValueError("fmax must not be greater than half of "
                             "sample rate.")
        else:
            # BUG FIX: a valid explicit fmax used to leave fmax_ unassigned,
            # which raised NameError on the next line.
            fmax_ = fmax
        self.fmax = fmax_

        self.eps = eps
        self.remove_dc_offset = True
        self.preemph = 0.97
        self.dither = dither  # only work in train mode

    def __repr__(self):
        return (
            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
            "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, "
            "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format(
                name=self.__class__.__name__,
                fs=self.fs,
                n_mels=self.n_mels,
                n_fft=self.n_fft,
                n_shift=self.n_shift,
                preemph=self.preemph,
                win_length=self.win_length,
                window=self.window,
                fmin=self.fmin,
                fmax=self.fmax,
                eps=self.eps,
                dither=self.dither, ))

    def __call__(self, x, train):
        """Compute (T, n_mels) log filterbank features.

        Args:
            x (np.ndarray): waveform, shape (Ti,)
            train (bool): True, train mode (enables dithering)

        Raises:
            ValueError: not support (Ti, C)

        Returns:
            np.ndarray: (T, D)
        """
        dither = self.dither if train else 0.0
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")

        # FIX: np.sctypes was removed in NumPy 2.0; issubdtype covers the
        # same float16/32/64 cases.
        if np.issubdtype(x.dtype, np.floating):
            # PCM32 -> PCM16
            bits = np.iinfo(np.int16).bits
            x = x * 2**(bits - 1)

        # logfbank need PCM16 input
        y = logfbank(
            signal=x,
            samplerate=self.fs,
            winlen=self.win_length,  # unit: s
            winstep=self.n_shift,  # unit: s
            nfilt=self.n_mels,
            nfft=self.n_fft,
            lowfreq=self.fmin,
            highfreq=self.fmax,
            dither=dither,
            remove_dc_offset=self.remove_dc_offset,
            preemph=self.preemph,
            wintype=self.window)
        return y
:param str import_path: syntax 'module_name:class_name' + e.g., 'paddlespeech.s2t.models.u2:U2Model' + :param dict alias: shortcut for registered class + :return: imported class + """ + if import_path not in alias and ":" not in import_path: + raise ValueError( + "import_path should be one of {} or " + 'include ":", e.g. "paddlespeech.s2t.models.u2:U2Model" : ' + "{}".format(set(alias), import_path)) + if ":" not in import_path: + import_path = alias[import_path] + + module_name, objname = import_path.split(":") + m = importlib.import_module(module_name) + return getattr(m, objname) + + +class Transformation(): + """Apply some functions to the mini-batch + + Examples: + >>> kwargs = {"process": [{"type": "fbank", + ... "n_mels": 80, + ... "fs": 16000}, + ... {"type": "cmvn", + ... "stats": "data/train/cmvn.ark", + ... "norm_vars": True}, + ... {"type": "delta", "window": 2, "order": 2}]} + >>> transform = Transformation(kwargs) + >>> bs = 10 + >>> xs = [np.random.randn(100, 80).astype(np.float32) + ... for _ in range(bs)] + >>> xs = transform(xs) + """ + + def __init__(self, conffile=None): + if conffile is not None: + if isinstance(conffile, dict): + self.conf = copy.deepcopy(conffile) + else: + with io.open(conffile, encoding="utf-8") as f: + self.conf = yaml.safe_load(f) + assert isinstance(self.conf, dict), type(self.conf) + else: + self.conf = {"mode": "sequential", "process": []} + + self.functions = OrderedDict() + if self.conf.get("mode", "sequential") == "sequential": + for idx, process in enumerate(self.conf["process"]): + assert isinstance(process, dict), type(process) + opts = dict(process) + process_type = opts.pop("type") + class_obj = dynamic_import(process_type, import_alias) + # TODO(karita): assert issubclass(class_obj, TransformInterface) + try: + self.functions[idx] = class_obj(**opts) + except TypeError: + try: + signa = signature(class_obj) + except ValueError: + # Some function, e.g. 
built-in function, are failed + pass + else: + logging.error("Expected signature: {}({})".format( + class_obj.__name__, signa)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + def __repr__(self): + rep = "\n" + "\n".join(" {}: {}".format(k, v) + for k, v in self.functions.items()) + return "{}({})".format(self.__class__.__name__, rep) + + def __call__(self, xs, uttid_list=None, **kwargs): + """Return new mini-batch + + :param Union[Sequence[np.ndarray], np.ndarray] xs: + :param Union[Sequence[str], str] uttid_list: + :return: batch: + :rtype: List[np.ndarray] + """ + if not isinstance(xs, Sequence): + is_batch = False + xs = [xs] + else: + is_batch = True + + if isinstance(uttid_list, str): + uttid_list = [uttid_list for _ in range(len(xs))] + + if self.conf.get("mode", "sequential") == "sequential": + for idx in range(len(self.conf["process"])): + func = self.functions[idx] + + # TODO(karita): use TrainingTrans and UttTrans to check __call__ args + # Derive only the args which the func has + try: + param = signature(func).parameters + except ValueError: + # Some function, e.g. 
built-in function, are failed + param = {} + _kwargs = {k: v for k, v in kwargs.items() if k in param} + try: + if uttid_list is not None and "uttid" in param: + xs = [ + func(x, u, **_kwargs) + for x, u in zip(xs, uttid_list) + ] + else: + xs = [func(x, **_kwargs) for x in xs] + + except Exception: + logging.fatal("Catch a exception from {}th func: {}".format( + idx, func)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + if is_batch: + return xs + else: + return xs[0] diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/utils/__init__.py b/models/speech/speech_recognition/deepspeech2/ixrt/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..309f97ed17605a7ee8b945efb9c90bc7245f7286 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/utils/__init__.py @@ -0,0 +1,16 @@ +from .load_tensorrt import deepspeech2_trtapi_ixrt, setup_io_bindings + + +VOCABLIST = ['', + '', + "'", + '', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + ''] + + +def check_target(inference, target): + satisfied = False + if inference > target: + satisfied = True + return satisfied diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/utils/error_rate.py b/models/speech/speech_recognition/deepspeech2/ixrt/utils/error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..1e81f9111703bfba5a5ffb73905dd7d4376e2f39 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/utils/error_rate.py @@ -0,0 +1,351 @@ +"""This module provides functions to calculate error rate in different level. +e.g. wer for word-level, cer for char-level. 
+""" +from itertools import groupby + +import editdistance +import numpy as np + +__all__ = ['word_errors', 'char_errors', 'wer', 'cer', "ErrorCalculator"] + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in range(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in range(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in range(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in word-level. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Levenshtein distance and word number of reference sentence. 
:rtype: tuple
code-block:: text + + Sw is the number of words subsituted, + Dw is the number of words deleted, + Iw is the number of words inserted, + Nw is the number of words in the reference + + We can use levenshtein distance to calculate WER. Please draw an attention + that empty items will be removed when splitting sentences by delimiter. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Word error rate. + :rtype: float + :raises ValueError: If word number of reference is zero. + """ + edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case, + delimiter) + + if ref_len == 0: + raise ValueError("Reference's word number should be greater than 0.") + + wer = float(edit_distance) / ref_len + return wer + + +def cer(reference, hypothesis, ignore_case=False, remove_space=False): + """Calculate charactor error rate (CER). CER compares reference text and + hypothesis text in char-level. CER is defined as: + + .. math:: + CER = (Sc + Dc + Ic) / Nc + + where + + .. code-block:: text + + Sc is the number of characters substituted, + Dc is the number of characters deleted, + Ic is the number of characters inserted + Nc is the number of characters in the reference + + We can use levenshtein distance to calculate CER. Chinese input should be + encoded to unicode. Please draw an attention that the leading and tailing + space characters will be truncated and multiple consecutive space + characters in a sentence will be replaced by one space character. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. 
+ :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool + :return: Character error rate. + :rtype: float + :raises ValueError: If the reference length is zero. + """ + edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, + remove_space) + + if ref_len == 0: + raise ValueError("Length of reference should be greater than 0.") + + cer = float(edit_distance) / ref_len + return cer + + +class ErrorCalculator(): + """Calculate CER and WER for E2E_ASR and CTC models during training. + + :param y_hats: numpy array with predicted text + :param y_pads: numpy array with true (target) text + :param char_list: List[str] + :param sym_space: + :param sym_blank: + :return: + """ + + def __init__(self, + char_list, + sym_space, + sym_blank, + report_cer=False, + report_wer=False): + """Construct an ErrorCalculator object.""" + super().__init__() + + self.report_cer = report_cer + self.report_wer = report_wer + + self.char_list = char_list + self.space = sym_space + self.blank = sym_blank + self.idx_blank = self.char_list.index(self.blank) + if self.space in self.char_list: + self.idx_space = self.char_list.index(self.space) + else: + self.idx_space = None + + def __call__(self, ys_hat, ys_pad, is_ctc=False): + """Calculate sentence-level WER/CER score. 
+ + :param paddle.Tensor ys_hat: prediction (batch, seqlen) + :param paddle.Tensor ys_pad: reference (batch, seqlen) + :param bool is_ctc: calculate CER score for CTC + :return: sentence-level WER score + :rtype float + :return: sentence-level CER score + :rtype float + """ + cer, wer = None, None + if is_ctc: + return self.calculate_cer_ctc(ys_hat, ys_pad) + elif not self.report_cer and not self.report_wer: + return cer, wer + + seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad) + if self.report_cer: + cer = self.calculate_cer(seqs_hat, seqs_true) + + if self.report_wer: + wer = self.calculate_wer(seqs_hat, seqs_true) + return cer, wer + + def calculate_cer_ctc(self, ys_hat, ys_pad): + """Calculate sentence-level CER score for CTC. + + :param paddle.Tensor ys_hat: prediction (batch, seqlen) + :param paddle.Tensor ys_pad: reference (batch, seqlen) + :return: average sentence-level CER score + :rtype float + """ + cers, char_ref_lens = [], [] + for i, y in enumerate(ys_hat): + y_hat = [x[0] for x in groupby(y)] + y_true = ys_pad[i] + seq_hat, seq_true = [], [] + for idx in y_hat: + idx = int(idx) + if idx != -1 and idx != self.idx_blank and idx != self.idx_space: + seq_hat.append(self.char_list[int(idx)]) + + for idx in y_true: + idx = int(idx) + if idx != -1 and idx != self.idx_blank and idx != self.idx_space: + seq_true.append(self.char_list[int(idx)]) + + hyp_chars = "".join(seq_hat) + ref_chars = "".join(seq_true) + if len(ref_chars) > 0: + cers.append(editdistance.eval(hyp_chars, ref_chars)) + char_ref_lens.append(len(ref_chars)) + + cer_ctc = float(sum(cers)) / sum(char_ref_lens) if cers else None + return cer_ctc + + def convert_to_char(self, ys_hat, ys_pad): + """Convert index to character. 
+ + :param paddle.Tensor seqs_hat: prediction (batch, seqlen) + :param paddle.Tensor seqs_true: reference (batch, seqlen) + :return: token list of prediction + :rtype list + :return: token list of reference + :rtype list + """ + seqs_hat, seqs_true = [], [] + for i, y_hat in enumerate(ys_hat): + y_true = ys_pad[i] + eos_true = np.where(y_true == -1)[0] + ymax = eos_true[0] if len(eos_true) > 0 else len(y_true) + # NOTE: padding index (-1) in y_true is used to pad y_hat + seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]] + seq_true = [ + self.char_list[int(idx)] for idx in y_true if int(idx) != -1 + ] + seq_hat_text = "".join(seq_hat).replace(self.space, " ") + seq_hat_text = seq_hat_text.replace(self.blank, "") + seq_true_text = "".join(seq_true).replace(self.space, " ") + seqs_hat.append(seq_hat_text) + seqs_true.append(seq_true_text) + return seqs_hat, seqs_true + + def calculate_cer(self, seqs_hat, seqs_true): + """Calculate sentence-level CER score. + + :param list seqs_hat: prediction + :param list seqs_true: reference + :return: average sentence-level CER score + :rtype float + """ + char_eds, char_ref_lens = [], [] + for i, seq_hat_text in enumerate(seqs_hat): + seq_true_text = seqs_true[i] + hyp_chars = seq_hat_text.replace(" ", "") + ref_chars = seq_true_text.replace(" ", "") + char_eds.append(editdistance.eval(hyp_chars, ref_chars)) + char_ref_lens.append(len(ref_chars)) + return float(sum(char_eds)) / sum(char_ref_lens) + + def calculate_wer(self, seqs_hat, seqs_true): + """Calculate sentence-level WER score. 
+ + :param list seqs_hat: prediction + :param list seqs_true: reference + :return: average sentence-level WER score + :rtype float + """ + word_eds, word_ref_lens = [], [] + for i, seq_hat_text in enumerate(seqs_hat): + seq_true_text = seqs_true[i] + hyp_words = seq_hat_text.split() + ref_words = seq_true_text.split() + word_eds.append(editdistance.eval(hyp_words, ref_words)) + word_ref_lens.append(len(ref_words)) + return float(sum(word_eds)) / sum(word_ref_lens) diff --git a/models/speech/speech_recognition/deepspeech2/ixrt/utils/load_tensorrt.py b/models/speech/speech_recognition/deepspeech2/ixrt/utils/load_tensorrt.py new file mode 100644 index 0000000000000000000000000000000000000000..164a939437b0f28c2d1c544850aead9cc3bb9e59 --- /dev/null +++ b/models/speech/speech_recognition/deepspeech2/ixrt/utils/load_tensorrt.py @@ -0,0 +1,56 @@ +import numpy as np +import tensorrt +from tensorrt import Dims +from cuda import cuda, cudart + + +def deepspeech2_trtapi_ixrt(engine_file): + datatype = tensorrt.DataType.FLOAT + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + with open(engine_file, "rb") as f, tensorrt.Runtime(logger) as runtime: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = context.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + binding = { + "index": i, + "name": name, + 
"dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations + diff --git a/tests/model_info.json b/tests/model_info.json index 1eb8591fa3a56ccd51211c20095b16493a1344d7..18488e263d4db5fd703b36fa6cc07fe9ac21df72 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -10424,6 +10424,207 @@ "type": "inference", "hasDemo": false, "demoType": "" + }, + { + "display_name": "ViT", + "model_name": "vit", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "cv/classification", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/classification/vit/ixrt/", + "readme_file": "models/cv/classification/vit/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "https://www.image-net.org/download.php", + "download_url": "https://local/vit.onnx", + "need_third_part": true, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "DeiT-B", + "model_name": "deit_b", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "cv/classification", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/classification/deit_b/ixrt/", + "readme_file": "models/cv/classification/deit_b/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + 
"datasets": "https://www.image-net.org/download.php", + "download_url": "https://local/deit_b.onnx", + "need_third_part": true, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "MobileNetV1", + "model_name": "mobilenet_v1", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "cv/classification", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/classification/mobilenet_v1/ixrt/", + "readme_file": "models/cv/classification/mobilenet_v1/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "https://www.image-net.org/download.php", + "download_url": "https://local/mobilenet_v1.onnx", + "need_third_part": true, + "precisions": [ + "fp16", + "int8" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "DBNet", + "model_name": "dbnet", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "cv/ocr", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/ocr/dbnet/ixrt/", + "readme_file": "models/cv/ocr/dbnet/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "local/icdar2015", + "download_url": "http://local/dbnet.onnx", + "need_third_part": true, + "precisions": [ + "fp16", + "int8" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "DDRNet", + "model_name": "ddrnet", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": 
"4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "cv/semantic_segmentation", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/semantic_segmentation/ddrnet/ixrt/", + "readme_file": "models/cv/semantic_segmentation/ddrnet/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "local/cityscapes", + "download_url": "http://local/ddrnet.onnx", + "need_third_part": true, + "precisions": [ + "fp16", + "int8" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "DeepSpeech2", + "model_name": "deepspeech2", + "framework": "ixrt", + "release_version": "26.06", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "speech/speech_recognition", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/speech/speech_recognition/deepspeech2/ixrt/", + "readme_file": "models/speech/speech_recognition/deepspeech2/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "local/LibriSpeech", + "download_url": "http://local/deepspeech2.onnx", + "need_third_part": true, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py index 61a86fad7fbd82ced93b8fd005992e4d9c4c80fb..420ee2a6684248acce889d298dfe5d433fc06509 100644 --- a/tests/run_ixrt.py +++ b/tests/run_ixrt.py @@ -124,11 +124,11 @@ def main(): logging.info(f"End running {model['model_name']} test case.") # multi_object_tracking模型 - if model["category"] in ["cv/multi_object_tracking", "cv/semantic_segmentation", 
"cv/ocr", "multimodal/diffusion_model", "speech/speech_synthesis"]: + if model["category"] in ["cv/multi_object_tracking", "cv/semantic_segmentation", "cv/ocr", "multimodal/diffusion_model", "speech/speech_synthesis", "speech/speech_recognition"]: logging.info(f"Start running {model['model_name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: - result = run_multi_object_tracking_testcase(model) + result = run_multi_object_tracking_testcase(model, whl_url) check_model_result(result) logging.debug(f"The result of {model['model_name']} is\n{json.dumps(result, indent=4)}") logging.info(f"End running {model['model_name']} test case.") @@ -507,7 +507,7 @@ def run_segmentation_and_face_testcase(model): logging.debug(f"matchs:\n{matchs}") return result -def run_multi_object_tracking_testcase(model): +def run_multi_object_tracking_testcase(model, whl_url): model_name = model["model_name"] result = { "name": model_name, @@ -527,6 +527,11 @@ def run_multi_object_tracking_testcase(model): ls -l | grep onnx """ + if model_name == "deepspeech2": + prepare_script += f""" + pip install {whl_url}`curl -s {whl_url} | grep -o 'paddlepaddle-[^"]*\.whl' | head -n1` + """ + # add pip list info when in debug mode if utils.is_debug(): pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" @@ -567,9 +572,10 @@ def run_multi_object_tracking_testcase(model): result["result"][prec] = result["result"][prec] | {m[0]: m[1], m[2]: m[3]} pattern = METRIC_PATTERN matchs = re.findall(pattern, sout) - if matchs and len(matchs) == 1: - result["result"][prec].update(get_metric_result(matchs[0])) - result["result"][prec]["status"] = "PASS" + if matchs: + for m in matchs: + result["result"][prec].update(get_metric_result(m)) + result["result"][prec]["status"] = "PASS" result["result"][prec]["Cost time (s)"] = t logging.debug(f"matchs:\n{matchs}") return result