diff --git a/frameworks/tensorflow/2.18.1/Dockerfile b/frameworks/tensorflow/2.18.1/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..2e9abf9f57f54981f56e3a2e4ba1cf12a59ad9a2 --- /dev/null +++ b/frameworks/tensorflow/2.18.1/Dockerfile @@ -0,0 +1,19 @@ + +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="stronking 363133710@qq.com" +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="TensorFlow 2.18.1 (GPU) on OpenCloudOS 9" + +ENV NVIDIA_VISIBLE_DEVICES=all \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=120 + + +WORKDIR /home + +RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \ + pip install "tensorflow[and-cuda]==2.18.1" + +CMD ["python3"] \ No newline at end of file diff --git a/frameworks/tensorflow/2.18.1/README.md b/frameworks/tensorflow/2.18.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2c8ceffd85879e39a90fd324bda5ab5cfd5ec89b --- /dev/null +++ b/frameworks/tensorflow/2.18.1/README.md @@ -0,0 +1,31 @@ +# tensorflow 2.18.1 on OpenCloudOS 9 + +## 基本信息 +- **框架版本**:v2.18.1 +- **基础镜像**:opencloudos9-cuda-devel:12.8 +- **Python 版本**:3.11 +- **CUDA 版本**: 12.x 或 更高 + +## 构建 + +docker build -t oc9-tensorflow:2.18.1 . 
+ +## 镜像启动命令 + +docker run -dit --gpus all --name oc9-tensorflow oc9-tensorflow:2.18.1 + +## 镜像测试命令 + +bash test.sh oc9-tensorflow:2.18.1 +docker run --rm --gpus all -v "$(pwd)/tensor_smoke_test.py:/home/tensor_smoke_test.py:ro" oc9-tensorflow:2.18.1 python3 tensor_smoke_test.py + +## 已知问题 +``` +对于如下日志输出,属于正常现象。这些日志通常不是致命错误。 +核心原因是:TensorFlow 在 import tensorflow as tf 时,会初始化 CUDA/XLA 相关组件,并注册 cuFFT、cuDNN、cuBLAS 等 GPU 插件工厂。 +日志里的意思是:这些插件工厂已经注册过一次,现在又尝试注册,所以打印了“already been registered”。 +TensorFlow 官方 GitHub 上有多个相同现象的 issue,日志内容和图片中的告警日志基本一致, +包括 Unable to register cuFFT factory、cuDNN factory、cuBLAS factory、computation placer already registered 等 +``` +如下图所示告警日志: +![告警日志](./warring_log.png) \ No newline at end of file diff --git a/frameworks/tensorflow/2.18.1/build.conf b/frameworks/tensorflow/2.18.1/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..b1f5736581ab05c9c6b15b8d9442e063afe25d8a --- /dev/null +++ b/frameworks/tensorflow/2.18.1/build.conf @@ -0,0 +1,4 @@ +# tensorflow 2.18.1 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-tensorflow +IMAGE_TAG=2.18.1 +GPU_TEST=false \ No newline at end of file diff --git a/frameworks/tensorflow/2.18.1/tensor_smoke_test.py b/frameworks/tensorflow/2.18.1/tensor_smoke_test.py new file mode 100644 index 0000000000000000000000000000000000000000..02cec134116518270c21bcf37daf410a619ca0a1 --- /dev/null +++ b/frameworks/tensorflow/2.18.1/tensor_smoke_test.py @@ -0,0 +1,52 @@ +import os +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + +import numpy as np +import tensorflow as tf + + +def main(): + print("TensorFlow version:", tf.__version__) + + gpus = tf.config.list_physical_devices("GPU") + print("GPUs:", gpus) + + # 基础张量计算 + a = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + b = tf.constant([[5.0, 6.0], [7.0, 8.0]]) + c = tf.matmul(a, b) + + print("matmul result:") + print(c.numpy()) + + expected = tf.constant([[19.0, 22.0], [43.0, 50.0]]) + tf.debugging.assert_near(c, expected) + + # 简单 Keras 模型 + x = np.array([[0.0], [1.0], [2.0], [3.0]], dtype=np.float32) + 
y = np.array([[1.0], [3.0], [5.0], [7.0]], dtype=np.float32) + + model = tf.keras.Sequential([ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(1), + ]) + + model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), + loss="mse", + ) + + history = model.fit(x, y, epochs=3, verbose=0) + + pred = model.predict(np.array([[4.0]], dtype=np.float32), verbose=0) + + assert pred.shape == (1, 1) + assert np.isfinite(pred).all() + + print("final loss:", float(history.history["loss"][-1])) + print("prediction:", float(pred[0][0])) + print("TensorFlow smoke test passed") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/frameworks/tensorflow/2.18.1/test.sh b/frameworks/tensorflow/2.18.1/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2fed93f16a25af247974d29f8d80998f0fb4dcc --- /dev/null +++ b/frameworks/tensorflow/2.18.1/test.sh @@ -0,0 +1,336 @@ +#!/usr/bin/env bash +# 在容器外执行:验证指定 Docker 镜像中的 TensorFlow / Keras / CUDA 基础功能。 +# +# 用法: +# bash test.sh [额外 docker run 参数...] +# +# 示例: +# bash test.sh my-tensorflow:latest +# bash test.sh my-tensorflow:latest --ipc=host +# REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test +# EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-tensorflow:latest + +set -Eeuo pipefail + +IMAGE="${1:-}" + +if [[ -z "${IMAGE}" || "${IMAGE}" == "-h" || "${IMAGE}" == "--help" ]]; then + cat <<'USAGE' +用法: + bash test.sh [额外 docker run 参数...] 
+ +环境变量: + REQUIRE_CUDA=1|0 是否强制要求 CUDA/GPU 可用,默认 1 + GPUS=all 传给 docker run --gpus 的值,默认 all + PYTHON_BIN=python3 容器内 Python 命令,默认 python3 + DOCKER_NETWORK=none docker 网络模式,默认 none + TIMEOUT_SECONDS=180 整体测试超时时间,默认 180 秒 + EXPECTED_TF_VERSION_PREFIX=2.19 可选,检查 TensorFlow 版本前缀 + +示例: + bash test.sh registry.example.com/ai/tensorflow:2.19-cu12 + REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test + EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-image:tag + bash test.sh my-image:tag --ipc=host --shm-size=2g +USAGE + exit 1 +fi + +shift || true + +REQUIRE_CUDA="${REQUIRE_CUDA:-1}" +GPUS="${GPUS:-all}" +PYTHON_BIN="${PYTHON_BIN:-python3}" +DOCKER_NETWORK="${DOCKER_NETWORK:-none}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-180}" +EXPECTED_TF_VERSION_PREFIX="${EXPECTED_TF_VERSION_PREFIX:-}" +EXTRA_DOCKER_ARGS=("$@") + +log() { printf '\033[1;34m%s\033[0m\n' "$*"; } +ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; } +warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; } +fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; } + +[[ "${REQUIRE_CUDA}" =~ ^[01]$ ]] || fail "REQUIRE_CUDA 只能是 1 或 0,当前值: ${REQUIRE_CUDA}" + +command -v docker >/dev/null 2>&1 || fail "未找到 docker 命令" +docker info >/dev/null 2>&1 || fail "docker daemon 不可用,请确认 Docker 服务已启动且当前用户有权限访问" + +if ! 
docker image inspect "${IMAGE}" >/dev/null 2>&1; then + warn "本地未找到镜像 ${IMAGE};docker run 可能会尝试拉取镜像" +fi + +DOCKER_ARGS=(run --rm -i --network "${DOCKER_NETWORK}") # apply DOCKER_NETWORK: it was defaulted and printed but never passed to docker run + +if [[ "${REQUIRE_CUDA}" == "1" ]]; then + DOCKER_ARGS+=(--gpus "${GPUS}") +fi + +DOCKER_ARGS+=( + -e "REQUIRE_CUDA=${REQUIRE_CUDA}" + -e "PYTHON_BIN=${PYTHON_BIN}" + -e "EXPECTED_TF_VERSION_PREFIX=${EXPECTED_TF_VERSION_PREFIX}" + -e "TF_CPP_MIN_LOG_LEVEL=2" +) + +DOCKER_ARGS+=(${EXTRA_DOCKER_ARGS[@]+"${EXTRA_DOCKER_ARGS[@]}"}) # guarded expansion: an empty array trips 'set -u' on bash < 4.4 +DOCKER_ARGS+=("${IMAGE}" /bin/bash -s) + +log "=== TensorFlow Docker 镜像功能测试 ===" +printf '镜像: %s\n' "${IMAGE}" +printf 'CUDA 强制检查: %s\n' "${REQUIRE_CUDA}" +printf 'Docker 网络: %s\n' "${DOCKER_NETWORK}" +printf 'Python 命令: %s\n' "${PYTHON_BIN}" + +if [[ -n "${EXPECTED_TF_VERSION_PREFIX}" ]]; then + printf '期望 TensorFlow 版本前缀: %s\n' "${EXPECTED_TF_VERSION_PREFIX}" +fi + +if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then + printf '额外 docker 参数: %s\n' "${EXTRA_DOCKER_ARGS[*]}" +fi + +RUN_CMD=(docker "${DOCKER_ARGS[@]}") +echo "##############################################" +echo "${RUN_CMD[*]}" # print the full command line that is about to run +echo "##############################################" + +if command -v timeout >/dev/null 2>&1; then + RUN_CMD=(timeout --preserve-status "${TIMEOUT_SECONDS}s" "${RUN_CMD[@]}") +fi + +"${RUN_CMD[@]}" <<'IN_CONTAINER' +set -Eeuo pipefail + +ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; } +warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; } +fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; } +section() { printf '\n\033[1;34m=== %s ===\033[0m\n' "$*"; } + +PY="${PYTHON_BIN:-python3}" + +if [[ -d /usr/local/cuda ]]; then + export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}" + export PATH="${CUDA_HOME}/bin:${PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}" +fi + +export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}" + +section "1. Python 基础检查" + +command -v "${PY}" >/dev/null 2>&1 || fail "容器内未找到 Python 命令: ${PY}" +"${PY}" --version +ok "Python 可用" + +section "2. 
TensorFlow 导入与版本检查" + +"${PY}" - <<'PY' +import os +import platform +import sys + +import tensorflow as tf + +print("python_executable:", sys.executable) +print("platform:", platform.platform()) +print("tensorflow_version:", tf.__version__) + +expected = os.environ.get("EXPECTED_TF_VERSION_PREFIX", "").strip() +if expected: + assert tf.__version__.startswith(expected), ( + f"期望 TensorFlow 版本前缀 {expected},实际版本 {tf.__version__}" + ) + print("expected_version_prefix:", expected) +PY + +ok "TensorFlow import 正常" + +section "3. TensorFlow 构建信息检查" + +"${PY}" - <<'PY' +import tensorflow as tf + +print("tf_version:", tf.__version__) + +try: + build_info = tf.sysconfig.get_build_info() + print("build_info:") + for k in sorted(build_info.keys()): + print(f" {k}: {build_info[k]}") +except Exception as e: + print("无法读取 tf.sysconfig.get_build_info():", repr(e)) + +try: + print("is_built_with_cuda:", tf.test.is_built_with_cuda()) +except Exception as e: + print("无法读取 tf.test.is_built_with_cuda():", repr(e)) +PY + +ok "TensorFlow 构建信息读取完成" + +section "4. CUDA / GPU 环境检查" + +if [[ "${REQUIRE_CUDA:-1}" == "1" ]]; then + command -v nvidia-smi >/dev/null 2>&1 || fail "nvidia-smi 不可用;请检查宿主机 NVIDIA 驱动、nvidia-container-toolkit、docker run --gpus 参数" + + echo "nvidia-smi 摘要:" + nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader || fail "nvidia-smi 执行失败" + + "${PY}" - <<'PY' +import tensorflow as tf + +gpus = tf.config.list_physical_devices("GPU") +logical_gpus = tf.config.list_logical_devices("GPU") + +print("physical_gpus:", gpus) +print("logical_gpus:", logical_gpus) + +assert len(gpus) > 0, "TensorFlow 未检测到物理 GPU" +assert len(logical_gpus) > 0, "TensorFlow 未检测到逻辑 GPU" + +for gpu in gpus: + try: + tf.config.experimental.set_memory_growth(gpu, True) + print(f"memory_growth_enabled: {gpu}") + except Exception as e: + print(f"memory_growth_skip: {gpu}, reason={e!r}") +PY + + ok "TensorFlow GPU 识别正常" + + section "5. 
TensorFlow GPU 实际计算检查" + + "${PY}" - <<'PY' +import tensorflow as tf + +with tf.device("/GPU:0"): + a = tf.random.normal((1024, 1024)) + b = tf.random.normal((1024, 1024)) + c = tf.matmul(a, b) + +result = c.numpy() + +assert result.shape == (1024, 1024) +assert not tf.math.reduce_any(tf.math.is_nan(c)).numpy(), "GPU 计算结果包含 NaN" + +print("gpu_matmul_shape:", result.shape) +print("gpu_matmul_mean:", float(tf.reduce_mean(c).numpy())) +PY + + ok "TensorFlow GPU 计算正常" + + if command -v nvcc >/dev/null 2>&1; then + echo "nvcc 版本:" + nvcc --version | sed -n '1,5p' + else + warn "未检测到 nvcc:运行时镜像通常不需要 nvcc,只有编译 CUDA 扩展时才需要" + fi +else + warn "REQUIRE_CUDA=0,跳过强制 CUDA / GPU 检查" +fi + +section "6. TensorFlow CPU 基础张量计算" + +"${PY}" - <<'PY' +import tensorflow as tf + +a = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +b = tf.constant([[5.0, 6.0], [7.0, 8.0]]) + +c = tf.matmul(a, b) + +expected = tf.constant([[19.0, 22.0], [43.0, 50.0]]) +tf.debugging.assert_near(c, expected) + +print("matmul_result:") +print(c.numpy()) +PY + +ok "TensorFlow 基础张量计算正常" + +section "7. 
Keras 模型构建、训练、推理检查" + +"${PY}" - <<'PY' +import os +import tempfile + +import numpy as np +import tensorflow as tf + +np.random.seed(123) +tf.random.set_seed(123) + +# 构造简单训练数据:y = 2x + 1 +x = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32) +y = np.array([[1.0], [3.0], [5.0], [7.0], [9.0], [11.0]], dtype=np.float32) + +model = tf.keras.Sequential([ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(8, activation="relu"), + tf.keras.layers.Dense(1), +]) + +model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), + loss="mse", +) + +history = model.fit(x, y, epochs=5, batch_size=2, verbose=0) + +pred = model.predict(np.array([[6.0]], dtype=np.float32), verbose=0) + +assert pred.shape == (1, 1), f"推理结果 shape 异常:{pred.shape}" +assert np.isfinite(pred).all(), "推理结果包含 NaN 或 Inf" + +print("final_loss:", float(history.history["loss"][-1])) +print("prediction_shape:", pred.shape) +print("prediction_value:", float(pred[0][0])) + +with tempfile.TemporaryDirectory() as tmp_dir: + keras_path = os.path.join(tmp_dir, "tiny_model.keras") + model.save(keras_path) + + loaded_model = tf.keras.models.load_model(keras_path) + loaded_pred = loaded_model.predict(np.array([[6.0]], dtype=np.float32), verbose=0) + + assert loaded_pred.shape == (1, 1), f"加载模型推理 shape 异常:{loaded_pred.shape}" + assert np.isfinite(loaded_pred).all(), "加载模型推理结果包含 NaN 或 Inf" + + print("keras_save_load: ok") +PY + +ok "Keras 模型训练、推理、保存、加载正常" + +section "8. 
tf.data 数据管道检查" + +"${PY}" - <<'PY' +import tensorflow as tf + +dataset = tf.data.Dataset.from_tensor_slices( + ( + tf.constant([[1.0], [2.0], [3.0], [4.0]]), + tf.constant([[2.0], [4.0], [6.0], [8.0]]), + ) +) + +dataset = dataset.batch(2).prefetch(tf.data.AUTOTUNE) + +batch_count = 0 +for xb, yb in dataset: + assert xb.shape[0] <= 2 + assert yb.shape[0] <= 2 + batch_count += 1 + +assert batch_count == 2, f"batch 数量异常: {batch_count}" + +print("tf_data_batch_count:", batch_count) +PY + +ok "tf.data 数据管道正常" + +section "测试结果" +ok "所有 TensorFlow 检查通过" +IN_CONTAINER + +ok "宿主机侧 docker run 验证完成" \ No newline at end of file diff --git a/frameworks/tensorflow/2.18.1/warring_log.png b/frameworks/tensorflow/2.18.1/warring_log.png new file mode 100644 index 0000000000000000000000000000000000000000..1461c0cedad8d45c8d32554de19c341ba88d37bc Binary files /dev/null and b/frameworks/tensorflow/2.18.1/warring_log.png differ diff --git a/frameworks/tensorflow/2.19.0/Dockerfile b/frameworks/tensorflow/2.19.0/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..4e44e6d6fbac6897481074fd86c5caa7291f83fd --- /dev/null +++ b/frameworks/tensorflow/2.19.0/Dockerfile @@ -0,0 +1,19 @@ + +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="stronking 363133710@qq.com" +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="TensorFlow 2.19.0 (GPU) on OpenCloudOS 9" + +ENV NVIDIA_VISIBLE_DEVICES=all \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=120 + + +WORKDIR /home + +RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \ + pip install "tensorflow[and-cuda]==2.19.0" + +CMD ["python3"] \ No newline at end of file diff --git a/frameworks/tensorflow/2.19.0/README.md b/frameworks/tensorflow/2.19.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b2cb9370da93241d386b2c14cc4d3c4ac2798946 
--- /dev/null +++ b/frameworks/tensorflow/2.19.0/README.md @@ -0,0 +1,31 @@ +# tensorflow 2.19.0 on OpenCloudOS 9 + +## 基本信息 +- **框架版本**:v2.19.0 +- **基础镜像**:opencloudos9-cuda-devel:12.8 +- **Python 版本**:3.11 +- **CUDA 版本**: 12.x 或 更高 + +## 构建 + +docker build -t oc9-tensorflow:2.19.0 . + +## 镜像启动命令 + +docker run -dit --gpus all --name oc9-tensorflow oc9-tensorflow:2.19.0 + +## 镜像测试命令 + +bash test.sh oc9-tensorflow:2.19.0 +docker run --rm --gpus all -v "$(pwd)/tensor_smoke_test.py:/home/tensor_smoke_test.py:ro" oc9-tensorflow:2.19.0 python3 tensor_smoke_test.py + +## 已知问题 +``` +对于如下日志输出,属于正常现象。这些日志通常不是致命错误。 +核心原因是:TensorFlow 在 import tensorflow as tf 时,会初始化 CUDA/XLA 相关组件,并注册 cuFFT、cuDNN、cuBLAS 等 GPU 插件工厂。 +日志里的意思是:这些插件工厂已经注册过一次,现在又尝试注册,所以打印了“already been registered”。 +TensorFlow 官方 GitHub 上有多个相同现象的 issue,日志内容和图片中的告警日志基本一致, +包括 Unable to register cuFFT factory、cuDNN factory、cuBLAS factory、computation placer already registered 等 +``` +如下图所示告警日志: +![告警日志](./warring_log.png) \ No newline at end of file diff --git a/frameworks/tensorflow/2.19.0/build.conf b/frameworks/tensorflow/2.19.0/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..e0a7020614dd0eaf103411ab1c481950f6592219 --- /dev/null +++ b/frameworks/tensorflow/2.19.0/build.conf @@ -0,0 +1,4 @@ +# tensorflow 2.19.0 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-tensorflow +IMAGE_TAG=2.19.0 +GPU_TEST=false \ No newline at end of file diff --git a/frameworks/tensorflow/2.19.0/tensor_smoke_test.py b/frameworks/tensorflow/2.19.0/tensor_smoke_test.py new file mode 100644 index 0000000000000000000000000000000000000000..02cec134116518270c21bcf37daf410a619ca0a1 --- /dev/null +++ b/frameworks/tensorflow/2.19.0/tensor_smoke_test.py @@ -0,0 +1,52 @@ +import os +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + +import numpy as np +import tensorflow as tf + + +def main(): + print("TensorFlow version:", tf.__version__) + + gpus = tf.config.list_physical_devices("GPU") + print("GPUs:", gpus) + + # 基础张量计算 + a = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + b = 
tf.constant([[5.0, 6.0], [7.0, 8.0]]) + c = tf.matmul(a, b) + + print("matmul result:") + print(c.numpy()) + + expected = tf.constant([[19.0, 22.0], [43.0, 50.0]]) + tf.debugging.assert_near(c, expected) + + # 简单 Keras 模型 + x = np.array([[0.0], [1.0], [2.0], [3.0]], dtype=np.float32) + y = np.array([[1.0], [3.0], [5.0], [7.0]], dtype=np.float32) + + model = tf.keras.Sequential([ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(1), + ]) + + model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), + loss="mse", + ) + + history = model.fit(x, y, epochs=3, verbose=0) + + pred = model.predict(np.array([[4.0]], dtype=np.float32), verbose=0) + + assert pred.shape == (1, 1) + assert np.isfinite(pred).all() + + print("final loss:", float(history.history["loss"][-1])) + print("prediction:", float(pred[0][0])) + print("TensorFlow smoke test passed") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/frameworks/tensorflow/2.19.0/test.sh b/frameworks/tensorflow/2.19.0/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..cbbce1b29c54efd794c0a4fe5b99b4fc028e41fa --- /dev/null +++ b/frameworks/tensorflow/2.19.0/test.sh @@ -0,0 +1,336 @@ +#!/usr/bin/env bash +# 在容器外执行:验证指定 Docker 镜像中的 TensorFlow / Keras / CUDA 基础功能。 +# +# 用法: +# bash test.sh [额外 docker run 参数...] +# +# 示例: +# bash test.sh my-tensorflow:latest +# bash test.sh my-tensorflow:latest --ipc=host +# REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test +# EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-tensorflow:latest + +set -Eeuo pipefail + +IMAGE="${1:-}" + +if [[ -z "${IMAGE}" || "${IMAGE}" == "-h" || "${IMAGE}" == "--help" ]]; then + cat <<'USAGE' +用法: + bash test.sh [额外 docker run 参数...] 
+ +环境变量: + REQUIRE_CUDA=1|0 是否强制要求 CUDA/GPU 可用,默认 1 + GPUS=all 传给 docker run --gpus 的值,默认 all + PYTHON_BIN=python3 容器内 Python 命令,默认 python3 + DOCKER_NETWORK=none docker 网络模式,默认 none + TIMEOUT_SECONDS=180 整体测试超时时间,默认 180 秒 + EXPECTED_TF_VERSION_PREFIX=2.19 可选,检查 TensorFlow 版本前缀 + +示例: + bash test.sh registry.example.com/ai/tensorflow:2.19-cu12 + REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test + EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-image:tag + bash test.sh my-image:tag --ipc=host --shm-size=2g +USAGE + exit 1 +fi + +shift || true + +REQUIRE_CUDA="${REQUIRE_CUDA:-1}" +GPUS="${GPUS:-all}" +PYTHON_BIN="${PYTHON_BIN:-python3}" +DOCKER_NETWORK="${DOCKER_NETWORK:-none}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-180}" +EXPECTED_TF_VERSION_PREFIX="${EXPECTED_TF_VERSION_PREFIX:-}" +EXTRA_DOCKER_ARGS=("$@") + +log() { printf '\033[1;34m%s\033[0m\n' "$*"; } +ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; } +warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; } +fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; } + +[[ "${REQUIRE_CUDA}" =~ ^[01]$ ]] || fail "REQUIRE_CUDA 只能是 1 或 0,当前值: ${REQUIRE_CUDA}" + +command -v docker >/dev/null 2>&1 || fail "未找到 docker 命令" +docker info >/dev/null 2>&1 || fail "docker daemon 不可用,请确认 Docker 服务已启动且当前用户有权限访问" + +if ! 
docker image inspect "${IMAGE}" >/dev/null 2>&1; then + warn "本地未找到镜像 ${IMAGE};docker run 可能会尝试拉取镜像" +fi + +DOCKER_ARGS=(run --rm -i --network "${DOCKER_NETWORK}") # apply DOCKER_NETWORK: it was defaulted and printed but never passed to docker run + +if [[ "${REQUIRE_CUDA}" == "1" ]]; then + DOCKER_ARGS+=(--gpus "${GPUS}") +fi + +DOCKER_ARGS+=( + -e "REQUIRE_CUDA=${REQUIRE_CUDA}" + -e "PYTHON_BIN=${PYTHON_BIN}" + -e "EXPECTED_TF_VERSION_PREFIX=${EXPECTED_TF_VERSION_PREFIX}" + -e "TF_CPP_MIN_LOG_LEVEL=2" +) + +DOCKER_ARGS+=(${EXTRA_DOCKER_ARGS[@]+"${EXTRA_DOCKER_ARGS[@]}"}) # guarded expansion: an empty array trips 'set -u' on bash < 4.4 +DOCKER_ARGS+=("${IMAGE}" /bin/bash -s) + +log "=== TensorFlow Docker 镜像功能测试 ===" +printf '镜像: %s\n' "${IMAGE}" +printf 'CUDA 强制检查: %s\n' "${REQUIRE_CUDA}" +printf 'Docker 网络: %s\n' "${DOCKER_NETWORK}" +printf 'Python 命令: %s\n' "${PYTHON_BIN}" + +if [[ -n "${EXPECTED_TF_VERSION_PREFIX}" ]]; then + printf '期望 TensorFlow 版本前缀: %s\n' "${EXPECTED_TF_VERSION_PREFIX}" +fi + +if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then + printf '额外 docker 参数: %s\n' "${EXTRA_DOCKER_ARGS[*]}" +fi + +RUN_CMD=(docker "${DOCKER_ARGS[@]}") +echo "##############################################" +echo "${RUN_CMD[*]}" # print the full command line that is about to run +echo "##############################################" + +if command -v timeout >/dev/null 2>&1; then + RUN_CMD=(timeout --preserve-status "${TIMEOUT_SECONDS}s" "${RUN_CMD[@]}") +fi + +"${RUN_CMD[@]}" <<'IN_CONTAINER' +set -Eeuo pipefail + +ok() { printf '\033[1;32m✓ %s\033[0m\n' "$*"; } +warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; } +fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; } +section() { printf '\n\033[1;34m=== %s ===\033[0m\n' "$*"; } + +PY="${PYTHON_BIN:-python3}" + +if [[ -d /usr/local/cuda ]]; then + export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}" + export PATH="${CUDA_HOME}/bin:${PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}" +fi + +export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}" + +section "1. Python 基础检查" + +command -v "${PY}" >/dev/null 2>&1 || fail "容器内未找到 Python 命令: ${PY}" +"${PY}" --version +ok "Python 可用" + +section "2. 
TensorFlow 导入与版本检查" + +"${PY}" - <<'PY' +import os +import platform +import sys + +import tensorflow as tf + +print("python_executable:", sys.executable) +print("platform:", platform.platform()) +print("tensorflow_version:", tf.__version__) + +expected = os.environ.get("EXPECTED_TF_VERSION_PREFIX", "").strip() +if expected: + assert tf.__version__.startswith(expected), ( + f"期望 TensorFlow 版本前缀 {expected},实际版本 {tf.__version__}" + ) + print("expected_version_prefix:", expected) +PY + +ok "TensorFlow import 正常" + +section "3. TensorFlow 构建信息检查" + +"${PY}" - <<'PY' +import tensorflow as tf + +print("tf_version:", tf.__version__) + +try: + build_info = tf.sysconfig.get_build_info() + print("build_info:") + for k in sorted(build_info.keys()): + print(f" {k}: {build_info[k]}") +except Exception as e: + print("无法读取 tf.sysconfig.get_build_info():", repr(e)) + +try: + print("is_built_with_cuda:", tf.test.is_built_with_cuda()) +except Exception as e: + print("无法读取 tf.test.is_built_with_cuda():", repr(e)) +PY + +ok "TensorFlow 构建信息读取完成" + +section "4. CUDA / GPU 环境检查" + +if [[ "${REQUIRE_CUDA:-1}" == "1" ]]; then + command -v nvidia-smi >/dev/null 2>&1 || fail "nvidia-smi 不可用;请检查宿主机 NVIDIA 驱动、nvidia-container-toolkit、docker run --gpus 参数" + + echo "nvidia-smi 摘要:" + nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader || fail "nvidia-smi 执行失败" + + "${PY}" - <<'PY' +import tensorflow as tf + +gpus = tf.config.list_physical_devices("GPU") +logical_gpus = tf.config.list_logical_devices("GPU") + +print("physical_gpus:", gpus) +print("logical_gpus:", logical_gpus) + +assert len(gpus) > 0, "TensorFlow 未检测到物理 GPU" +assert len(logical_gpus) > 0, "TensorFlow 未检测到逻辑 GPU" + +for gpu in gpus: + try: + tf.config.experimental.set_memory_growth(gpu, True) + print(f"memory_growth_enabled: {gpu}") + except Exception as e: + print(f"memory_growth_skip: {gpu}, reason={e!r}") +PY + + ok "TensorFlow GPU 识别正常" + + section "5. 
TensorFlow GPU 实际计算检查" + + "${PY}" - <<'PY' +import tensorflow as tf + +with tf.device("/GPU:0"): + a = tf.random.normal((1024, 1024)) + b = tf.random.normal((1024, 1024)) + c = tf.matmul(a, b) + +result = c.numpy() + +assert result.shape == (1024, 1024) +assert not tf.math.reduce_any(tf.math.is_nan(c)).numpy(), "GPU 计算结果包含 NaN" + +print("gpu_matmul_shape:", result.shape) +print("gpu_matmul_mean:", float(tf.reduce_mean(c).numpy())) +PY + + ok "TensorFlow GPU 计算正常" + + if command -v nvcc >/dev/null 2>&1; then + echo "nvcc 版本:" + nvcc --version | sed -n '1,5p' + else + warn "未检测到 nvcc:运行时镜像通常不需要 nvcc,只有编译 CUDA 扩展时才需要" + fi +else + warn "REQUIRE_CUDA=0,跳过强制 CUDA / GPU 检查" +fi + +section "6. TensorFlow CPU 基础张量计算" + +"${PY}" - <<'PY' +import tensorflow as tf + +a = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +b = tf.constant([[5.0, 6.0], [7.0, 8.0]]) + +c = tf.matmul(a, b) + +expected = tf.constant([[19.0, 22.0], [43.0, 50.0]]) +tf.debugging.assert_near(c, expected) + +print("matmul_result:") +print(c.numpy()) +PY + +ok "TensorFlow 基础张量计算正常" + +section "7. 
Keras 模型构建、训练、推理检查" + +"${PY}" - <<'PY' +import os +import tempfile + +import numpy as np +import tensorflow as tf + +np.random.seed(123) +tf.random.set_seed(123) + +# 构造简单训练数据:y = 2x + 1 +x = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32) +y = np.array([[1.0], [3.0], [5.0], [7.0], [9.0], [11.0]], dtype=np.float32) + +model = tf.keras.Sequential([ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(8, activation="relu"), + tf.keras.layers.Dense(1), +]) + +model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), + loss="mse", +) + +history = model.fit(x, y, epochs=5, batch_size=2, verbose=0) + +pred = model.predict(np.array([[6.0]], dtype=np.float32), verbose=0) + +assert pred.shape == (1, 1), f"推理结果 shape 异常:{pred.shape}" +assert np.isfinite(pred).all(), "推理结果包含 NaN 或 Inf" + +print("final_loss:", float(history.history["loss"][-1])) +print("prediction_shape:", pred.shape) +print("prediction_value:", float(pred[0][0])) + +with tempfile.TemporaryDirectory() as tmp_dir: + keras_path = os.path.join(tmp_dir, "tiny_model.keras") + model.save(keras_path) + + loaded_model = tf.keras.models.load_model(keras_path) + loaded_pred = loaded_model.predict(np.array([[6.0]], dtype=np.float32), verbose=0) + + assert loaded_pred.shape == (1, 1), f"加载模型推理 shape 异常:{loaded_pred.shape}" + assert np.isfinite(loaded_pred).all(), "加载模型推理结果包含 NaN 或 Inf" + + print("keras_save_load: ok") +PY + +ok "Keras 模型训练、推理、保存、加载正常" + +section "8. 
tf.data 数据管道检查" + +"${PY}" - <<'PY' +import tensorflow as tf + +dataset = tf.data.Dataset.from_tensor_slices( + ( + tf.constant([[1.0], [2.0], [3.0], [4.0]]), + tf.constant([[2.0], [4.0], [6.0], [8.0]]), + ) +) + +dataset = dataset.batch(2).prefetch(tf.data.AUTOTUNE) + +batch_count = 0 +for xb, yb in dataset: + assert xb.shape[0] <= 2 + assert yb.shape[0] <= 2 + batch_count += 1 + +assert batch_count == 2, f"batch 数量异常: {batch_count}" + +print("tf_data_batch_count:", batch_count) +PY + +ok "tf.data 数据管道正常" + +section "测试结果" +ok "所有 TensorFlow 检查通过" +IN_CONTAINER + +ok "宿主机侧 docker run 验证完成" \ No newline at end of file diff --git a/frameworks/tensorflow/2.19.0/test_result.png b/frameworks/tensorflow/2.19.0/test_result.png new file mode 100644 index 0000000000000000000000000000000000000000..48f15883c5c9d8cc6e2e22cd6ede0169048d2c0f Binary files /dev/null and b/frameworks/tensorflow/2.19.0/test_result.png differ diff --git a/frameworks/tensorflow/2.19.0/warring_log.png b/frameworks/tensorflow/2.19.0/warring_log.png new file mode 100644 index 0000000000000000000000000000000000000000..1461c0cedad8d45c8d32554de19c341ba88d37bc Binary files /dev/null and b/frameworks/tensorflow/2.19.0/warring_log.png differ diff --git a/frameworks/tensorflow/2.21.0/Dockerfile b/frameworks/tensorflow/2.21.0/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..8def434782612f47c4349ee88d0a5adc24c81872 --- /dev/null +++ b/frameworks/tensorflow/2.21.0/Dockerfile @@ -0,0 +1,19 @@ + +FROM opencloudos/opencloudos9-cuda-devel:12.8 + +LABEL maintainer="stronking 363133710@qq.com" +LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container" +LABEL org.opencontainers.image.description="TensorFlow 2.21.0 (GPU) on OpenCloudOS 9" + +ENV NVIDIA_VISIBLE_DEVICES=all \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=120 + + +WORKDIR /home + +RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \ 
+ pip install "tensorflow[and-cuda]==2.21.0" + +CMD ["python3"] \ No newline at end of file diff --git a/frameworks/tensorflow/2.21.0/README.md b/frameworks/tensorflow/2.21.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..36bc11abcc366fe05d8bb2f853fa83f391cf676a --- /dev/null +++ b/frameworks/tensorflow/2.21.0/README.md @@ -0,0 +1,31 @@ +# tensorflow 2.21.0 on OpenCloudOS 9 + +## 基本信息 +- **框架版本**:v2.21.0 +- **基础镜像**:opencloudos9-cuda-devel:12.8 +- **Python 版本**:3.11 +- **CUDA 版本**: 12.x 或 更高 + +## 构建 + +docker build -t oc9-tensorflow:2.21.0 . + +## 镜像启动命令 + +docker run -dit --gpus all --name oc9-tensorflow oc9-tensorflow:2.21.0 + +## 镜像测试命令 + +bash test.sh oc9-tensorflow:2.21.0 +docker run --rm --gpus all -v "$(pwd)/tensor_smoke_test.py:/home/tensor_smoke_test.py:ro" oc9-tensorflow:2.21.0 python3 tensor_smoke_test.py + +## 已知问题 +``` +对于如下日志输出,属于正常现象。这些日志通常不是致命错误。 +核心原因是:TensorFlow 在 import tensorflow as tf 时,会初始化 CUDA/XLA 相关组件,并注册 cuFFT、cuDNN、cuBLAS 等 GPU 插件工厂。 +日志里的意思是:这些插件工厂已经注册过一次,现在又尝试注册,所以打印了“already been registered”。 +TensorFlow 官方 GitHub 上有多个相同现象的 issue,日志内容和图片中的告警日志基本一致, +包括 Unable to register cuFFT factory、cuDNN factory、cuBLAS factory、computation placer already registered 等 +``` +如下图所示告警日志: +![告警日志](./warring_log.png) \ No newline at end of file diff --git a/frameworks/tensorflow/2.21.0/build.conf b/frameworks/tensorflow/2.21.0/build.conf new file mode 100644 index 0000000000000000000000000000000000000000..b0255949f5a755fb67ece75bc5eeb3900eddb169 --- /dev/null +++ b/frameworks/tensorflow/2.21.0/build.conf @@ -0,0 +1,4 @@ +# tensorflow 2.21.0 on OpenCloudOS 9 (GPU) +IMAGE_NAME=oc9-tensorflow +IMAGE_TAG=2.21.0 +GPU_TEST=false \ No newline at end of file diff --git a/frameworks/tensorflow/2.21.0/tensor_smoke_test.py b/frameworks/tensorflow/2.21.0/tensor_smoke_test.py new file mode 100644 index 0000000000000000000000000000000000000000..02cec134116518270c21bcf37daf410a619ca0a1 --- /dev/null +++ b/frameworks/tensorflow/2.21.0/tensor_smoke_test.py @@ -0,0 +1,52 @@ 
+import os +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + +import numpy as np +import tensorflow as tf + + +def main(): + print("TensorFlow version:", tf.__version__) + + gpus = tf.config.list_physical_devices("GPU") + print("GPUs:", gpus) + + # 基础张量计算 + a = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + b = tf.constant([[5.0, 6.0], [7.0, 8.0]]) + c = tf.matmul(a, b) + + print("matmul result:") + print(c.numpy()) + + expected = tf.constant([[19.0, 22.0], [43.0, 50.0]]) + tf.debugging.assert_near(c, expected) + + # 简单 Keras 模型 + x = np.array([[0.0], [1.0], [2.0], [3.0]], dtype=np.float32) + y = np.array([[1.0], [3.0], [5.0], [7.0]], dtype=np.float32) + + model = tf.keras.Sequential([ + tf.keras.layers.Input(shape=(1,)), + tf.keras.layers.Dense(1), + ]) + + model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), + loss="mse", + ) + + history = model.fit(x, y, epochs=3, verbose=0) + + pred = model.predict(np.array([[4.0]], dtype=np.float32), verbose=0) + + assert pred.shape == (1, 1) + assert np.isfinite(pred).all() + + print("final loss:", float(history.history["loss"][-1])) + print("prediction:", float(pred[0][0])) + print("TensorFlow smoke test passed") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/frameworks/tensorflow/2.21.0/test.sh b/frameworks/tensorflow/2.21.0/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2fed93f16a25af247974d29f8d80998f0fb4dcc --- /dev/null +++ b/frameworks/tensorflow/2.21.0/test.sh @@ -0,0 +1,336 @@ +#!/usr/bin/env bash +# 在容器外执行:验证指定 Docker 镜像中的 TensorFlow / Keras / CUDA 基础功能。 +# +# 用法: +# bash test.sh [额外 docker run 参数...] 
#!/usr/bin/env bash
# Run on the host (outside the container): verifies basic TensorFlow / Keras /
# CUDA functionality of a given Docker image by piping a check script into
# `docker run ... /bin/bash -s`.
#
# Usage:
#   bash test.sh IMAGE [extra docker run args...]
#
# Examples:
#   bash test.sh my-tensorflow:latest
#   bash test.sh my-tensorflow:latest --ipc=host
#   REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test
#   EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-tensorflow:latest

set -Eeuo pipefail

IMAGE="${1:-}"

if [[ -z "${IMAGE}" || "${IMAGE}" == "-h" || "${IMAGE}" == "--help" ]]; then
  cat <<'USAGE'
用法:
  bash test.sh IMAGE [额外 docker run 参数...]

环境变量:
  REQUIRE_CUDA=1|0                  是否强制要求 CUDA/GPU 可用,默认 1
  GPUS=all                          传给 docker run --gpus 的值,默认 all
  PYTHON_BIN=python3                容器内 Python 命令,默认 python3
  DOCKER_NETWORK=none               docker 网络模式,默认 none
  TIMEOUT_SECONDS=180               整体测试超时时间,默认 180 秒
  EXPECTED_TF_VERSION_PREFIX=2.19   可选,检查 TensorFlow 版本前缀

示例:
  bash test.sh registry.example.com/ai/tensorflow:2.19-cu12
  REQUIRE_CUDA=0 bash test.sh tensorflow-cpu:test
  EXPECTED_TF_VERSION_PREFIX=2.19 bash test.sh my-image:tag
  bash test.sh my-image:tag --ipc=host --shm-size=2g
USAGE
  # An explicit help request succeeds; a missing image is a usage error.
  if [[ -n "${IMAGE}" ]]; then
    exit 0
  fi
  exit 1
fi

shift || true

REQUIRE_CUDA="${REQUIRE_CUDA:-1}"
GPUS="${GPUS:-all}"
PYTHON_BIN="${PYTHON_BIN:-python3}"
DOCKER_NETWORK="${DOCKER_NETWORK:-none}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-180}"
EXPECTED_TF_VERSION_PREFIX="${EXPECTED_TF_VERSION_PREFIX:-}"
EXTRA_DOCKER_ARGS=("$@")

# Coloured output helpers; fail() writes to stderr and aborts the script.
log()  { printf '\033[1;34m%s\033[0m\n' "$*"; }
ok()   { printf '\033[1;32m✓ %s\033[0m\n' "$*"; }
warn() { printf '\033[1;33m! %s\033[0m\n' "$*"; }
fail() { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; }

[[ "${REQUIRE_CUDA}" =~ ^[01]$ ]] || fail "REQUIRE_CUDA 只能是 1 或 0,当前值: ${REQUIRE_CUDA}"

command -v docker >/dev/null 2>&1 || fail "未找到 docker 命令"
docker info >/dev/null 2>&1 || fail "docker daemon 不可用,请确认 Docker 服务已启动且当前用户有权限访问"

if ! docker image inspect "${IMAGE}" >/dev/null 2>&1; then
  warn "本地未找到镜像 ${IMAGE};docker run 可能会尝试拉取镜像"
fi

# Detect a network choice already present in the extra arguments so that
# DOCKER_NETWORK does not add a second, possibly conflicting, --network flag.
# The length guard avoids expanding an empty array under `set -u`, which
# aborts on Bash releases older than 4.4.
NETWORK_IN_EXTRA=0
if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then
  for extra_arg in "${EXTRA_DOCKER_ARGS[@]}"; do
    case "${extra_arg}" in
      --network|--network=*|--net|--net=*) NETWORK_IN_EXTRA=1 ;;
    esac
  done
fi

DOCKER_ARGS=(run --rm -i)

# Apply the documented DOCKER_NETWORK setting (previously parsed and printed
# but never handed to docker).
if [[ "${NETWORK_IN_EXTRA}" == "0" ]]; then
  DOCKER_ARGS+=(--network "${DOCKER_NETWORK}")
fi

if [[ "${REQUIRE_CUDA}" == "1" ]]; then
  DOCKER_ARGS+=(--gpus "${GPUS}")
fi

# Settings the in-container script reads from its environment.
DOCKER_ARGS+=(
  -e "REQUIRE_CUDA=${REQUIRE_CUDA}"
  -e "PYTHON_BIN=${PYTHON_BIN}"
  -e "EXPECTED_TF_VERSION_PREFIX=${EXPECTED_TF_VERSION_PREFIX}"
  -e "TF_CPP_MIN_LOG_LEVEL=2"
)

if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then
  DOCKER_ARGS+=("${EXTRA_DOCKER_ARGS[@]}")
fi
# `bash -s` makes the container shell read the check script from stdin.
DOCKER_ARGS+=("${IMAGE}" /bin/bash -s)

log "=== TensorFlow Docker 镜像功能测试 ==="
printf '镜像: %s\n' "${IMAGE}"
printf 'CUDA 强制检查: %s\n' "${REQUIRE_CUDA}"
printf 'Docker 网络: %s\n' "${DOCKER_NETWORK}"
printf 'Python 命令: %s\n' "${PYTHON_BIN}"

if [[ -n "${EXPECTED_TF_VERSION_PREFIX}" ]]; then
  printf '期望 TensorFlow 版本前缀: %s\n' "${EXPECTED_TF_VERSION_PREFIX}"
fi

if ((${#EXTRA_DOCKER_ARGS[@]} > 0)); then
  printf '额外 docker 参数: %s\n' "${EXTRA_DOCKER_ARGS[*]}"
fi

RUN_CMD=(docker "${DOCKER_ARGS[@]}")
echo "##############################################"
printf '%s\n' "${RUN_CMD[*]}"
echo "##############################################"

# Bound the whole run when `timeout` exists; --preserve-status keeps the
# exit status of the docker command instead of timeout's own code.
if command -v timeout >/dev/null 2>&1; then
  RUN_CMD=(timeout --preserve-status "${TIMEOUT_SECONDS}s" "${RUN_CMD[@]}")
fi

# Everything up to IN_CONTAINER is executed by bash inside the container.
# The quoted delimiter keeps the host shell from expanding anything in it.
"${RUN_CMD[@]}" <<'IN_CONTAINER'
set -Eeuo pipefail

ok()      { printf '\033[1;32m✓ %s\033[0m\n' "$*"; }
warn()    { printf '\033[1;33m! %s\033[0m\n' "$*"; }
fail()    { printf '\033[1;31m✗ %s\033[0m\n' "$*" >&2; exit 1; }
section() { printf '\n\033[1;34m=== %s ===\033[0m\n' "$*"; }

PY="${PYTHON_BIN:-python3}"

# Put a conventional CUDA install on PATH / LD_LIBRARY_PATH when present.
if [[ -d /usr/local/cuda ]]; then
  export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
  export PATH="${CUDA_HOME}/bin:${PATH}"
  export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
fi

export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"

section "1. Python 基础检查"

command -v "${PY}" >/dev/null 2>&1 || fail "容器内未找到 Python 命令: ${PY}"
"${PY}" --version
ok "Python 可用"

section "2. TensorFlow 导入与版本检查"

"${PY}" - <<'PY'
import os
import platform
import sys

import tensorflow as tf

print("python_executable:", sys.executable)
print("platform:", platform.platform())
print("tensorflow_version:", tf.__version__)

expected = os.environ.get("EXPECTED_TF_VERSION_PREFIX", "").strip()
if expected:
    assert tf.__version__.startswith(expected), (
        f"期望 TensorFlow 版本前缀 {expected},实际版本 {tf.__version__}"
    )
    print("expected_version_prefix:", expected)
PY

ok "TensorFlow import 正常"

section "3. TensorFlow 构建信息检查"

# Informational only: failures to read build metadata are printed, not fatal.
"${PY}" - <<'PY'
import tensorflow as tf

print("tf_version:", tf.__version__)

try:
    build_info = tf.sysconfig.get_build_info()
    print("build_info:")
    for k in sorted(build_info.keys()):
        print(f"  {k}: {build_info[k]}")
except Exception as e:
    print("无法读取 tf.sysconfig.get_build_info():", repr(e))

try:
    print("is_built_with_cuda:", tf.test.is_built_with_cuda())
except Exception as e:
    print("无法读取 tf.test.is_built_with_cuda():", repr(e))
PY

ok "TensorFlow 构建信息读取完成"

section "4. CUDA / GPU 环境检查"

if [[ "${REQUIRE_CUDA:-1}" == "1" ]]; then
  command -v nvidia-smi >/dev/null 2>&1 || fail "nvidia-smi 不可用;请检查宿主机 NVIDIA 驱动、nvidia-container-toolkit、docker run --gpus 参数"

  echo "nvidia-smi 摘要:"
  nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader || fail "nvidia-smi 执行失败"

  "${PY}" - <<'PY'
import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
logical_gpus = tf.config.list_logical_devices("GPU")

print("physical_gpus:", gpus)
print("logical_gpus:", logical_gpus)

assert len(gpus) > 0, "TensorFlow 未检测到物理 GPU"
assert len(logical_gpus) > 0, "TensorFlow 未检测到逻辑 GPU"

for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
        print(f"memory_growth_enabled: {gpu}")
    except Exception as e:
        print(f"memory_growth_skip: {gpu}, reason={e!r}")
PY

  ok "TensorFlow GPU 识别正常"

  section "5. TensorFlow GPU 实际计算检查"

  "${PY}" - <<'PY'
import tensorflow as tf

with tf.device("/GPU:0"):
    a = tf.random.normal((1024, 1024))
    b = tf.random.normal((1024, 1024))
    c = tf.matmul(a, b)

result = c.numpy()

assert result.shape == (1024, 1024)
assert not tf.math.reduce_any(tf.math.is_nan(c)).numpy(), "GPU 计算结果包含 NaN"

print("gpu_matmul_shape:", result.shape)
print("gpu_matmul_mean:", float(tf.reduce_mean(c).numpy()))
PY

  ok "TensorFlow GPU 计算正常"

  if command -v nvcc >/dev/null 2>&1; then
    echo "nvcc 版本:"
    nvcc --version | sed -n '1,5p'
  else
    warn "未检测到 nvcc:运行时镜像通常不需要 nvcc,只有编译 CUDA 扩展时才需要"
  fi
else
  warn "REQUIRE_CUDA=0,跳过强制 CUDA / GPU 检查"
fi

section "6. TensorFlow CPU 基础张量计算"

"${PY}" - <<'PY'
import tensorflow as tf

a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[5.0, 6.0], [7.0, 8.0]])

c = tf.matmul(a, b)

expected = tf.constant([[19.0, 22.0], [43.0, 50.0]])
tf.debugging.assert_near(c, expected)

print("matmul_result:")
print(c.numpy())
PY

ok "TensorFlow 基础张量计算正常"

section "7. Keras 模型构建、训练、推理检查"

"${PY}" - <<'PY'
import os
import tempfile

import numpy as np
import tensorflow as tf

np.random.seed(123)
tf.random.set_seed(123)

# Simple training data: y = 2x + 1
x = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32)
y = np.array([[1.0], [3.0], [5.0], [7.0], [9.0], [11.0]], dtype=np.float32)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1,)),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(1),
])

model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    loss="mse",
)

history = model.fit(x, y, epochs=5, batch_size=2, verbose=0)

pred = model.predict(np.array([[6.0]], dtype=np.float32), verbose=0)

assert pred.shape == (1, 1), f"推理结果 shape 异常:{pred.shape}"
assert np.isfinite(pred).all(), "推理结果包含 NaN 或 Inf"

print("final_loss:", float(history.history["loss"][-1]))
print("prediction_shape:", pred.shape)
print("prediction_value:", float(pred[0][0]))

with tempfile.TemporaryDirectory() as tmp_dir:
    keras_path = os.path.join(tmp_dir, "tiny_model.keras")
    model.save(keras_path)

    loaded_model = tf.keras.models.load_model(keras_path)
    loaded_pred = loaded_model.predict(np.array([[6.0]], dtype=np.float32), verbose=0)

    assert loaded_pred.shape == (1, 1), f"加载模型推理 shape 异常:{loaded_pred.shape}"
    assert np.isfinite(loaded_pred).all(), "加载模型推理结果包含 NaN 或 Inf"
    # A round trip is only proven if the reloaded model reproduces the
    # original output, not merely a finite value of the right shape.
    assert np.allclose(loaded_pred, pred, rtol=1e-4, atol=1e-5), (
        f"加载模型推理结果与原模型不一致:{loaded_pred} vs {pred}"
    )

    print("keras_save_load: ok")
PY

ok "Keras 模型训练、推理、保存、加载正常"

section "8. tf.data 数据管道检查"

"${PY}" - <<'PY'
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(
    (
        tf.constant([[1.0], [2.0], [3.0], [4.0]]),
        tf.constant([[2.0], [4.0], [6.0], [8.0]]),
    )
)

dataset = dataset.batch(2).prefetch(tf.data.AUTOTUNE)

batch_count = 0
for xb, yb in dataset:
    assert xb.shape[0] <= 2
    assert yb.shape[0] <= 2
    batch_count += 1

assert batch_count == 2, f"batch 数量异常: {batch_count}"

print("tf_data_batch_count:", batch_count)
PY

ok "tf.data 数据管道正常"

section "测试结果"
ok "所有 TensorFlow 检查通过"
IN_CONTAINER

ok "宿主机侧 docker run 验证完成"