1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
| from pathlib import Path
import torch import torch.nn as nn import torch.profiler from torch._inductor import aoti_compile_and_package, aoti_load_package from torch._subclasses.fake_tensor import FakeTensorMode
class MLP(nn.Module):
    """Two-layer perceptron whose layers can live on different devices.

    The first linear layer and the GELU activation run on *fc1_device*; the
    second linear layer runs on *fc2_device*. That covers pure-CPU, pure-GPU,
    and CPU-GPU hybrid placements. If the two devices differ, ``forward``
    performs an explicit device transfer between the layers, which is kept as
    an ``aten._to_copy`` node in the exported graph. If they are the same,
    the transfer is a no-op.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        out_features: int,
        fc1_device: torch.device = torch.device("cpu"),
        fc2_device: torch.device = torch.device("cpu"),
    ) -> None:
        super().__init__()
        # Submodules are registered in the order fc1, act, fc2 so that
        # parameter initialisation and state_dict ordering stay stable.
        self.fc1 = nn.Linear(in_features, hidden_features, device=fc1_device)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features, device=fc2_device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply fc1 -> GELU, move to fc2's device, then apply fc2."""
        hidden = self.fc1(x)
        activated = self.act(hidden)
        # The target device is read from fc2's weight so the hop follows
        # wherever that layer's parameters currently live.
        relocated = activated.to(self.fc2.weight.device)
        return self.fc2(relocated)
def aoti_compile(model: nn.Module, x: torch.Tensor, package_path: str) -> object:
    """Export *model* and compile it ahead of time with AOTInductor.

    Tracing is driven by a fake tensor that mirrors the shape, dtype, and
    device of *x*, so ``torch.export`` can capture the graph without
    allocating real activation memory. The function is intended to work for
    any device (cpu, cuda) and any model topology.

    Args:
        model: Module to export.
        x: Example input; only its shape, dtype, and device are read.
        package_path: Path handed to ``aoti_compile_and_package`` as the
            package destination.

    Returns:
        The runner produced by ``aoti_load_package`` for the compiled
        package, loaded with ``run_single_threaded=True``.
    """
    # NOTE(review): only the placeholder allocation is assumed to sit inside
    # the fake-tensor context, with export and compilation outside it —
    # confirm against the original formatting.
    with FakeTensorMode():
        placeholder = torch.empty(x.shape, dtype=x.dtype, device=x.device)

    exported_program = torch.export.export(model, (placeholder,))
    package_file = aoti_compile_and_package(
        exported_program,
        package_path=package_path,
    )
    return aoti_load_package(package_file, run_single_threaded=True)
def profile_runner(
    runner,
    x: torch.Tensor,
    trace_path: str,
    label: str,
    warmup: int = 3,
    steps: int = 5,
) -> None:
    """Run an AOTI runner under ``torch.profiler`` and save a Chrome trace.

    The runner is invoked ``warmup + steps`` times on *x*. The profiler
    schedule uses no wait phase, *warmup* warm-up iterations, *steps* active
    iterations, and a single cycle. Each call is wrapped in a
    ``record_function`` range named ``step_<i>``. The trace is then exported
    to *trace_path* and a confirmation line tagged with *label* is printed.

    Note: AOTI runners call compiled C++ directly and bypass the ATen
    dispatcher's profiling hooks. Consequently the trace contains no cpu_op
    events (e.g. aten::mm, aten::gelu) — to the profiler the runner is an
    opaque native call. What the trace does contain are CUDA runtime events
    (cudaLaunchKernel) and, when CUPTI is available, the actual GPU kernel
    execution on the device timeline.

    Args:
        runner: Callable invoked as ``runner(x)`` on every iteration.
        x: Input passed to the runner unchanged.
        trace_path: Destination file for the Chrome trace.
        label: Prefix used in the printed confirmation message.
        warmup: Number of warm-up iterations in the profiler schedule.
        steps: Number of active (recorded) iterations.
    """
    activity_kind = torch.profiler.ProfilerActivity
    total_iterations = warmup + steps

    profiler = torch.profiler.profile(
        activities=[activity_kind.CPU, activity_kind.CUDA],
        schedule=torch.profiler.schedule(
            wait=0,
            warmup=warmup,
            active=steps,
            repeat=1,
        ),
        record_shapes=True,
        with_flops=True,
    )

    with profiler as prof:
        for iteration in range(total_iterations):
            with torch.profiler.record_function(f"step_{iteration}"):
                runner(x)
            # Advance the schedule once per iteration so the warm-up and
            # active phases line up with the loop.
            prof.step()

    prof.export_chrome_trace(trace_path)
    print(f"{label} trace written to {trace_path}")
if __name__ == "__main__":
    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda")

    artifacts_dir = Path(__file__).parent / "aoti_artifacts"
    artifacts_dir.mkdir(exist_ok=True)

    # One row per placement, exercised in this order: CPU only, GPU only,
    # then the CPU -> GPU hybrid split. Each row is
    # (fc1 device, fc2 device, package file, success message,
    #  trace file, profiler label).
    # The example input is always created on the fc1 device.
    scenarios = (
        (
            cpu_device,
            cpu_device,
            "cpu.pt2",
            "AOTInductor compile (CPU) succeeded.",
            "cpu_trace.json",
            "AOTInductor (CPU)",
        ),
        (
            gpu_device,
            gpu_device,
            "cuda.pt2",
            "AOTInductor compile (GPU) succeeded.",
            "cuda_trace.json",
            "AOTInductor (GPU)",
        ),
        (
            cpu_device,
            gpu_device,
            "hybrid.pt2",
            "AOTInductor compile (CPU-GPU hybrid) succeeded.",
            "hybrid_trace.json",
            "AOTInductor (CPU-GPU hybrid)",
        ),
    )

    for first_dev, second_dev, package_name, ok_message, trace_name, tag in scenarios:
        model = MLP(128, 256, 10, fc1_device=first_dev, fc2_device=second_dev).eval()
        sample = torch.randn(4, 128, device=first_dev)

        runner = aoti_compile(model, sample, str(artifacts_dir / package_name))

        # The compiled runner must reproduce the eager model's output before
        # it is worth profiling.
        torch.testing.assert_close(runner(sample), model(sample))
        print(ok_message)

        profile_runner(runner, sample, str(artifacts_dir / trace_name), tag)
|