PyTorch AOTInductor Hybrid Lowering

05-28-2026 | 05-28-2026 | blog | 8 minutes read (About 1224 words) | visits

Introduction

In my previous blog post “PyTorch Fake Export”, I mentioned that a PyTorch exported program allows operators placed on different devices, together with explicit device transfer operators, to coexist in the same graph. This means that a PyTorch exported program can have a hybrid device execution plan, where some operators are executed on CPU and some operators are executed on GPU, with explicit device transfers in between.

In this blog post, I would like to discuss how PyTorch AOTInductor can compile a PyTorch exported program with a hybrid device execution plan into a single executable package that can run the whole model end-to-end, with part of the operators running on CPU and part of the operators running on GPU, without needing to manually split the model into separate CPU and GPU sub-models.

PyTorch AOTInductor Hybrid Lowering

The following example follows the one used in my previous blog post “PyTorch Fake Export”. However, instead of exporting fake models, this time we have to export the model with real (non-fake) parameter data. The example input tensors can still be fake tensors, though. The exported programs are compiled with AOTInductor using the torch._inductor.aoti_compile_and_package API, which produces a single executable package for each exported program. The compiled AOTInductor package can be loaded with the torch._inductor.aoti_load_package API, which returns a Python callable that can be invoked with real input tensors to run the model end-to-end.

test_torch_hybrid_lowering.py

from pathlib import Path

import torch
import torch.nn as nn
import torch.profiler
from torch._inductor import aoti_compile_and_package, aoti_load_package
from torch._subclasses.fake_tensor import FakeTensorMode


class MLP(nn.Module):
    """Two-layer perceptron whose layers can be placed on separate devices.

    The first linear layer (followed by GELU) is created on *fc1_device* and
    the second linear layer on *fc2_device*. Before fc2 is applied, the
    hidden activation is moved to the device that holds fc2's weight. With
    two different devices this is an explicit device transfer (kept as an
    aten._to_copy node in the exported graph); with a single shared device
    the move does nothing.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        out_features: int,
        fc1_device: torch.device = torch.device("cpu"),
        fc2_device: torch.device = torch.device("cpu"),
    ) -> None:
        super().__init__()
        # Layers are created in the order fc1 -> fc2 so that parameter
        # initialisation consumes the random number generator in a fixed order.
        self.fc1 = nn.Linear(in_features, hidden_features, device=fc1_device)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features, device=fc2_device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        # Hand the activation over to fc2's device; nothing happens when both
        # layers already share a device.
        target_device = self.fc2.weight.device
        return self.fc2(hidden.to(target_device))


def aoti_compile(model: nn.Module, x: torch.Tensor,
                 package_path: str) -> object:
    """Export *model*, compile it with AOTInductor, and load the result.

    Only the shape, dtype and device of *x* are used: an empty tensor with
    those properties is created under FakeTensorMode and passed to
    torch.export as the example input. The exported program is compiled and
    packaged at *package_path*, and the package is then loaded with
    run_single_threaded=True.

    Returns whatever aoti_load_package returns for the compiled package.
    """
    # Build the stand-in example input inside the fake-tensor mode; the export
    # itself runs after the mode has been exited.
    with FakeTensorMode():
        example_input = torch.empty(x.shape, dtype=x.dtype, device=x.device)

    example_args = (example_input, )
    exported_program = torch.export.export(model, example_args)
    package_file = aoti_compile_and_package(exported_program,
                                            package_path=package_path)
    runner = aoti_load_package(package_file, run_single_threaded=True)
    return runner


def profile_runner(runner,
                   x: torch.Tensor,
                   trace_path: str,
                   label: str,
                   warmup: int = 3,
                   steps: int = 5) -> None:
    """Run *runner* under torch.profiler and save a Chrome trace.

    *runner* is called with *x* once per iteration, for ``warmup + steps``
    iterations in total. The profiler schedule treats the first *warmup*
    iterations as warm-up and the following *steps* iterations as active.
    Each call is wrapped in a ``step_<i>`` record_function range. Afterwards
    the trace is exported to *trace_path* and a confirmation line prefixed
    with *label* is printed.

    Note: AOTI runners call compiled C++ directly, bypassing the ATen
    dispatcher's profiling hooks, so no cpu_op events (e.g. aten::mm,
    aten::gelu) appear in the trace; to the profiler the runner is an opaque
    native call. The trace does capture CUDA runtime events
    (cudaLaunchKernel) and, when CUPTI is available, the GPU kernel
    execution on the device timeline.
    """
    total_iterations = warmup + steps
    profiler = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(wait=0,
                                         warmup=warmup,
                                         active=steps,
                                         repeat=1),
        record_shapes=True,
        with_flops=True,
    )
    with profiler:
        for iteration in range(total_iterations):
            with torch.profiler.record_function(f"step_{iteration}"):
                runner(x)
            # Advance the profiler schedule by one iteration.
            profiler.step()
    profiler.export_chrome_trace(trace_path)
    print(f"{label} trace written to {trace_path}")


if __name__ == "__main__":

    # Compile, verify and profile the same MLP under three device placements:
    # all-CPU, all-GPU, and a CPU-GPU hybrid split.
    cpu = torch.device("cpu")
    cuda = torch.device("cuda")

    # Packages and traces are written next to this script.
    output_dir = Path(__file__).parent / "aoti_artifacts"
    output_dir.mkdir(exist_ok=True)

    # (artifact file stem, label used in messages, fc1 device, fc2 device)
    placements = (
        ("cpu", "CPU", cpu, cpu),
        ("cuda", "GPU", cuda, cuda),
        ("hybrid", "CPU-GPU hybrid", cpu, cuda),
    )

    for stem, label, first_device, second_device in placements:
        model = MLP(128,
                    256,
                    10,
                    fc1_device=first_device,
                    fc2_device=second_device).eval()
        # The input is created on the device that holds fc1.
        sample = torch.randn(4, 128, device=first_device)
        runner = aoti_compile(model, sample, str(output_dir / f"{stem}.pt2"))
        # The compiled package must reproduce the eager model's output.
        torch.testing.assert_close(runner(sample), model(sample))
        print(f"AOTInductor compile ({label}) succeeded.")
        profile_runner(runner, sample,
                       str(output_dir / f"{stem}_trace.json"),
                       f"AOTInductor ({label})")

Using the NVIDIA NGC Docker container nvcr.io/nvidia/pytorch:26.04-py3, we can run the above script to export the concrete MLP model for the CPU, GPU, and CPU-GPU hybrid device configurations, and compile the exported programs with AOTInductor. The hybrid CPU-GPU execution of the hybrid AOTInductor package can be verified by examining the profiling traces.

$ python test_torch_hybrid_lowering.py
/usr/lib/python3.12/copyreg.py:99: FutureWarning: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
  return cls.__new__(cls, *args)
/usr/local/lib/python3.12/dist-packages/torch/utils/_config_module.py:540: FutureWarning: torch._dynamo.config.skip_code_recursive_on_recompile_limit_hit is deprecated and does not do anything. It will be removed in a future version of PyTorch.
  config[key] = copy.deepcopy(getattr(self, key))
AOTInductor compile (CPU) succeeded.
/usr/local/lib/python3.12/dist-packages/torch/profiler/profiler.py:229: UserWarning: Warning: Profiler clears events at the end of each cycle.Only events from the current cycle will be reported.To keep events across cycles, set acc_events=True.
  _warn_once(
AOTInductor (CPU) trace written to /mnt/aoti_artifacts/cpu_trace.json
/usr/lib/python3.12/copyreg.py:99: FutureWarning: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
  return cls.__new__(cls, *args)
AOTInductor compile (GPU) succeeded.
AOTInductor (GPU) trace written to /mnt/aoti_artifacts/cuda_trace.json
/usr/lib/python3.12/copyreg.py:99: FutureWarning: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
  return cls.__new__(cls, *args)
AOTInductor compile (CPU-GPU hybrid) succeeded.
AOTInductor (CPU-GPU hybrid) trace written to /mnt/aoti_artifacts/hybrid_trace.json

References

AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models

PyTorch AOTInductor Hybrid Lowering

https://leimao.github.io/blog/PyTorch-AOTInductor-Hybrid-Lowering/

Author

Lei Mao

Posted on

05-28-2026

Updated on

05-28-2026

Licensed under

Deep Learning Inference,

PyTorch

PyTorch AOTInductor Hybrid Lowering

Introduction

PyTorch AOTInductor Hybrid Lowering

References

Author

Posted on

Updated on

Licensed under

Like this article? Support the author with

Comments

Advertisement

Catalogue