From c579d4e41985ace1dbb10015800a2a59e8a281d4 Mon Sep 17 00:00:00 2001 From: chenxingqiang Date: Wed, 3 Dec 2025 19:44:50 +0800 Subject: [PATCH] =?UTF-8?q?[Level=203=20=E6=96=87=E6=A1=A3=E5=BC=80?= =?UTF-8?q?=E5=8F=91]=20Complete=20JIT=20compilation=20tutorial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comprehensive JIT compilation documentation - Cover basic usage, decorators, and compilation options - Include profiling and best practices - Add troubleshooting guide For MACA GPU development on MetaX C500 --- docs/tutorials/jit_compilation.md | 169 +++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) diff --git a/docs/tutorials/jit_compilation.md b/docs/tutorials/jit_compilation.md index 108bc9d2..db5d5178 100644 --- a/docs/tutorials/jit_compilation.md +++ b/docs/tutorials/jit_compilation.md @@ -1,2 +1,169 @@ Just In Time Compilation -========================= +======================== + +
+Author: Competition Participant +
+
## Overview

TileLang provides Just-In-Time (JIT) compilation capabilities that allow you to compile GPU kernels at runtime. This tutorial explains how to use JIT compilation effectively for MACA GPU development.

## Basic JIT Compilation

### Using `tilelang.compile`

The primary way to JIT compile a TileLang kernel:

```python
import tilelang
import tilelang.language as T

@T.prim_func
def my_kernel(A: T.Tensor((M, N), "float"), B: T.Tensor((M, N), "float")):
    with T.Kernel(T.ceildiv(M, 32), T.ceildiv(N, 32), threads=128) as (bx, by):
        # kernel implementation
        pass

# JIT compile for MACA GPU
kernel = tilelang.compile(
    my_kernel,
    out_idx=-1,              # Output tensor index
    target="maca",           # Target: "maca", "cuda", or "cpu"
    execution_backend="cython"
)

# Execute the compiled kernel
result = kernel(input_tensor)
```

### Using the `@tilelang.jit` Decorator

For a more Pythonic approach, use the JIT decorator:

```python
import tilelang

@tilelang.jit(out_idx=-1, target="maca")
def matmul(M, N, K, block_M, block_N, block_K):
    @T.prim_func
    def main(A: T.Tensor((M, K), "float16"),
             B: T.Tensor((K, N), "float16"),
             C: T.Tensor((M, N), "float16")):
        # Implementation
        pass
    return main

# Calling the decorated factory with its configuration parameters
# triggers compilation on first call and returns the compiled kernel
kernel = matmul(1024, 1024, 1024, 128, 128, 32)

# Execute the compiled kernel with the input tensors
c = kernel(a, b)
```

## Compilation Options

### Target Selection

| Target | Description |
|--------|-------------|
| `"maca"` | MetaX MACA GPU (e.g., C500) |
| `"cuda"` | NVIDIA CUDA GPU |
| `"cpu"` | CPU backend |

### Execution Backends

| Backend | Description |
|---------|-------------|
| `"cython"` | Default, good for most cases |
| `"dlpack"` | DLPack interface for interoperability |

### Pass Configurations

```python
kernel = tilelang.compile(
    func,
    target="maca",
    pass_configs={
        "tl.disable_tma_lower": True,  # Disable TMA for older GPUs
    }
)
```

## Caching

TileLang automatically caches compiled kernels. 
To benefit from caching: + +```python +# Kernels with same parameters are cached +kernel1 = tilelang.compile(func, target="maca") +kernel2 = tilelang.compile(func, target="maca") # Returns cached version +``` + +## Profiling Compiled Kernels + +```python +kernel = tilelang.compile(func, out_idx=-1, target="maca") + +# Get profiler +profiler = kernel.get_profiler() + +# Validate correctness +profiler.assert_allclose(ref_func, rtol=1e-2, atol=1e-2) + +# Benchmark performance +latency = profiler.do_bench(warmup=100) +print(f"Latency: {latency} ms") +``` + +## Example: Complete JIT Workflow + +```python +import torch +import tilelang +import tilelang.language as T + +def vector_add(N, block_size): + @T.prim_func + def main(A: T.Tensor((N,), "float"), + B: T.Tensor((N,), "float"), + C: T.Tensor((N,), "float")): + with T.Kernel(T.ceildiv(N, block_size), threads=128) as bx: + for i in T.Parallel(block_size): + idx = bx * block_size + i + C[idx] = A[idx] + B[idx] + return main + +# Compile +N = 1024 +func = vector_add(N, 32) +kernel = tilelang.compile(func, out_idx=-1, target="maca") + +# Execute +a = torch.randn(N, device="cuda") +b = torch.randn(N, device="cuda") +c = kernel(a, b) + +# Validate +assert torch.allclose(c, a + b) +print("Success!") +``` + +## Best Practices + +1. **Reuse compiled kernels**: Store compiled kernels and reuse them +2. **Use appropriate block sizes**: Match block sizes to your GPU architecture +3. **Profile before optimizing**: Use the profiler to identify bottlenecks +4. 
**Test correctness first**: Always validate against a reference implementation + +## Troubleshooting + +### Common Issues + +| Issue | Solution | +|-------|----------| +| Compilation error | Check TileLang syntax and tensor shapes | +| Runtime error | Ensure tensors are on correct device | +| Poor performance | Try different block sizes and configurations | + +## Further Reading + +- [Auto-Tuning](auto_tuning.md) - Automatically find optimal configurations +- [Debug Tools](debug_tools_for_tilelang.md) - Debugging TileLang kernels -- Gitee