pytorch · cehongwang · May 5, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 8, 2026
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -251,6 +251,10 @@ TRTEngine::TRTEngine(
     num_io = std::make_pair(inputs_size, outputs);
   }
 
+  // Reconstruct optimization-profile info (count + per-profile ranges) from the
+  // TRT API so multi-profile selection works for any loaded engine.
+  this->setup_optimization_profiles();
+
 #ifndef NDEBUG
   this->enable_profiling();
 #endif
@@ -512,6 +516,117 @@ void TRTEngine::reset_captured_graph() {
   cudagraph.reset();
 }
 
+// Rebuild the cached optimization-profile metadata from the deserialized engine.
+//
+// Populates:
+//   - num_optimization_profiles: cuda_engine->getNbOptimizationProfiles().
+//   - is_shape_inference_io:     one entry per name in in_binding_names.
+//   - profile_dynamic_dims:      for engines with more than one profile, the
+//                                dims of each non-shape-inference input that can
+//                                distinguish profiles, with their inclusive
+//                                (min, max) range for every profile index.
+//
+// Throws (via TORCHTRT_CHECK) if TensorRT cannot report a profile shape for an
+// input or if an input's rank is not the same in every profile.
+void TRTEngine::setup_optimization_profiles() {
+  num_optimization_profiles = cuda_engine->getNbOptimizationProfiles();
+  profile_dynamic_dims.clear();
+  is_shape_inference_io.clear();
+  for (const auto& name : in_binding_names) {
+    is_shape_inference_io[name] = cuda_engine->isShapeInferenceIO(name.c_str());
+  }
+  // With zero or one profile there is nothing to select between.
+  if (num_optimization_profiles <= 1) {
+    return;
+  }
+  for (const auto& name : in_binding_names) {
+    if (is_shape_inference_io[name]) {
+      continue;
+    }
+    // Gather [min, max] for every dim across every profile: dim -> [profile] -> (min, max).
+    std::vector<std::vector<std::pair<int64_t, int64_t>>> per_dim;
+    for (int64_t p = 0; p < num_optimization_profiles; ++p) {
+      const auto dmin =
+          cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMIN);
+      const auto dmax =
+          cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMAX);
+      // A failed query is reported through a negative nbDims. Passing that to
+      // resize() would convert it to a huge unsigned size, so fail explicitly.
+      TORCHTRT_CHECK(
+          dmin.nbDims >= 0 && dmax.nbDims == dmin.nbDims,
+          "Unable to query the shape range of input " << name << " for optimization profile " << p);
+      if (p == 0) {
+        per_dim.resize(dmin.nbDims);
+      }
+      // Every per-dim vector must end up with exactly one entry per profile,
+      // because auto_select_profile indexes profile_ranges by profile index.
+      TORCHTRT_CHECK(
+          static_cast<size_t>(dmin.nbDims) == per_dim.size(),
+          "Input " << name << " has rank " << dmin.nbDims << " in optimization profile " << p << " but rank "
+                   << per_dim.size() << " in profile 0");
+      for (int d = 0; d < dmin.nbDims; ++d) {
+        per_dim[d].push_back(std::make_pair(dmin.d[d], dmax.d[d]));
+      }
+    }
+    // Keep only dims that vary within a profile (min != max) or differ across
+    // profiles; a dim that is the same fixed extent in every profile cannot
+    // distinguish profiles, so we skip it (TRT validates it at setInputShape).
+    auto& dynamic_dims = profile_dynamic_dims[name];
+    for (int32_t d = 0; d < static_cast<int32_t>(per_dim.size()); ++d) {
+      const auto& ranges = per_dim[d];
+      bool is_dynamic = false;
+      for (const auto& r : ranges) {
+        if (r.first != r.second || r != ranges[0]) {
+          is_dynamic = true;
+          break;
+        }
+      }
+      if (is_dynamic) {
+        dynamic_dims.push_back({d, ranges});
+      }
+    }
+  }
+}
+
+// Make `profile_index` the active optimization profile of exec_ctx.
+//
+// No-op when the engine has at most one profile or when `profile_index` is
+// already the active one. Otherwise the switch is issued on the current CUDA
+// stream for device_info.id and the host blocks until that stream has drained.
+// The captured CUDA graph and the cached shape key are then invalidated.
+//
+// Throws (via TORCHTRT_CHECK) if `profile_index` is outside
+// [0, num_optimization_profiles) or if TensorRT rejects the switch.
+void TRTEngine::set_active_profile(int64_t profile_index) {
+  if (num_optimization_profiles <= 1) {
+    return;
+  }
+  if (profile_index == active_profile_index) {
+    return;
+  }
+  // This method is bound directly through torchbind, so it can be reached
+  // without the upstream validation in TorchTensorRTModule.resolve_profile_index.
+  // Range-check before narrowing to int32_t: an out-of-range int64_t could
+  // otherwise wrap onto a valid profile and leave active_profile_index out of
+  // sync with the profile actually loaded in exec_ctx.
+  TORCHTRT_CHECK(
+      profile_index >= 0 && profile_index < num_optimization_profiles,
+      "Optimization profile index " << profile_index << " is out of range [0, " << num_optimization_profiles
+                                    << ")");
+  // Issue the switch on the current CUDA stream of the engine's device and wait
+  // for it below, so it has completed before any later enqueue on any stream.
+  auto stream = c10::cuda::getCurrentCUDAStream(device_info.id);
+  TORCHTRT_CHECK(
+      exec_ctx->setOptimizationProfileAsync(static_cast<int32_t>(profile_index), stream.stream()),
+      "Failed to switch to optimization profile index " << profile_index);
+  stream.synchronize();
+  active_profile_index = profile_index;
+  // A profile switch invalidates any captured CUDA graph and changes the
+  // context state, so force re-record / shape re-inference on the next call.
+  runtime_states.context_changed = true;
+  reset_captured_graph();
+  shape_key = "None";
+  LOG_DEBUG("Switched to optimization profile index " << profile_index);
+}
+
+// Choose an optimization profile for `inputs`.
+//
+// Profiles are tried in ascending index order and the first one that accepts
+// every input is returned. Only the dims cached in profile_dynamic_dims are
+// inspected; dims with the same fixed extent in every profile are not cached
+// and are left for TRT to validate at setInputShape.
+//
+// Throws (via TORCHTRT_THROW_ERROR) when no profile accepts the inputs.
+int64_t TRTEngine::auto_select_profile(const std::vector<at::Tensor>& inputs) {
+  // True when each cached dynamic dim of each supplied input lies inside the
+  // inclusive [min, max] range that `profile` declares for it.
+  const auto profile_accepts = [&](int64_t profile) -> bool {
+    for (size_t input_idx = 0; input_idx < in_binding_names.size(); ++input_idx) {
+      const auto& binding = in_binding_names[input_idx];
+      // Bindings without a matching tensor and shape-inference IO are not checked.
+      if (input_idx >= inputs.size() || is_shape_inference_io[binding]) {
+        continue;
+      }
+      const auto entry = profile_dynamic_dims.find(binding);
+      if (entry == profile_dynamic_dims.end()) {
+        continue;
+      }
+      const auto shape = inputs[input_idx].sizes();
+      const auto rank = static_cast<int32_t>(shape.size());
+      for (const auto& dyn : entry->second) {
+        // A cached dim beyond the tensor's rank is ignored here.
+        if (dyn.dim_index >= rank) {
+          continue;
+        }
+        const auto& bounds = dyn.profile_ranges[profile];
+        const int64_t extent = shape[dyn.dim_index];
+        if (extent < bounds.first || extent > bounds.second) {
+          return false;
+        }
+      }
+    }
+    return true;
+  };
+  for (int64_t candidate = 0; candidate < num_optimization_profiles; ++candidate) {
+    if (profile_accepts(candidate)) {
+      return candidate;
+    }
+  }
+  TORCHTRT_THROW_ERROR(
+      "No optimization profile matches the input shapes. Fix the input shapes or pin a profile "
+      "explicitly via optimization_profile(module, index).");
+  return 0; // unreachable
+}
+
 void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
   if (new_strategy != this->resource_allocation_strategy) {
     this->resource_allocation_strategy = new_strategy;

diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -218,6 +218,41 @@ struct TRTEngine : torch::CustomClassHolder {
   bool use_pre_allocated_outputs = false;
   std::vector<at::Tensor> pre_allocated_outputs;
 
+  // --- Multiple optimization profiles ---
+  // State and helpers mirror the Python runtime (TRTEngine in _TRTEngine.py) so
+  // the C++ and Python runtimes are interchangeable: the same attribute and
+  // method names are exposed via torchbind in register_jit_hooks.cpp
+  // (``num_optimization_profiles``, ``_active_profile_index``,
+  // ``_auto_select_profiles``, ``set_active_profile``). Index validation lives
+  // in the runtime-agnostic TorchTensorRTModule.resolve_profile_index.
+  int64_t num_optimization_profiles = 1; // cuda_engine->getNbOptimizationProfiles()
+  int64_t active_profile_index = 0; // profile currently loaded in exec_ctx
+  bool auto_select_profiles = false; // opt-in shape-based selection (per call)
+  // A single input dimension whose extent varies across or within optimization
+  // profiles, paired with its [min, max] range for each profile index. Dims that
+  // are a fixed identical extent in every profile are NOT stored, so selection
+  // only inspects dims that can actually distinguish profiles.
+  struct DynamicProfileDim {
+    // Zero-based position of this dimension within the input tensor's shape.
+    int32_t dim_index;
+    // Inclusive (min, max) extent of this dimension, one entry per profile.
+    std::vector<std::pair<int64_t, int64_t>> profile_ranges; // indexed by profile index
+  };
+  // input name -> only its dynamic dims (static dims are omitted). Cached from
+  // the TRT API.
+  std::unordered_map<std::string, std::vector<DynamicProfileDim>> profile_dynamic_dims;
+  std::unordered_map<std::string, bool> is_shape_inference_io;
+
+  // Cache profile count + per-profile dim ranges purely from the TRT API
+  // (getNbOptimizationProfiles / getProfileShape) so selection works for any
+  // loaded engine with no extra serialized metadata.
+  void setup_optimization_profiles();
+  // Switch the active TRT optimization profile (idempotent).
+  void set_active_profile(int64_t profile_index);
+  // Lazy / first-working: first profile whose [min, max] fits all input shapes.
+  // Called internally from the execute_engine run paths (guarded by
+  // num_optimization_profiles > 1 && auto_select_profiles); manual pins are
+  // applied eagerly via set_active_profile.
+  int64_t auto_select_profile(const std::vector<at::Tensor>& inputs);
+
   // Single placeholder buffer for empty tensor inputs (allocated once, reused)
   void* empty_tensor_placeholder = nullptr;
 

diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -241,8 +241,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   auto run_standard_execution = [&]() {
     bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
-    bool shape_changed = _validate_shapes(inputs, compiled_engine);
-
+    // Resolve the execution stream before the optimization-profile selection below.
+    // NOTE(review): TRTEngine::set_active_profile issues the switch on the current
+    // CUDA stream and host-synchronizes it; for a default-stream caller that is not
+    // engine_stream. Confirm whether the switch should target engine_stream instead.
     auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
     auto default_stream = compiled_engine->default_stream;
     auto previous_engine_stream = compiled_engine->engine_stream;
@@ -260,6 +262,15 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       compiled_engine->runtime_states.context_changed = true;
     }
 
+    // Auto-select the optimization profile from input shapes before validating
+    // shapes, so a profile switch's context_changed flag and shape_key reset are
+    // observed below. set_active_profile blocks until the switch has completed.
+    // Only auto-selection runs per call; manual pins are applied eagerly.
+    if (compiled_engine->num_optimization_profiles > 1 && compiled_engine->auto_select_profiles) {
+      compiled_engine->set_active_profile(compiled_engine->auto_select_profile(inputs));
+    }
+    bool shape_changed = _validate_shapes(inputs, compiled_engine);
+
     // Whether cudagraphs needs to record the graph on this pass
     auto result = compiled_engine->runtime_states.set_runtime_states(
         cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
@@ -401,6 +412,26 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   };
 
   auto run_output_allocator = [&]() {
+    // Resolve the execution stream before the optimization-profile selection below.
+    // NOTE(review): set_active_profile switches on the current CUDA stream, which is not engine_stream for a default-stream caller; confirm intent.
+    auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
+    auto default_stream = compiled_engine->default_stream;
+    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+    bool caller_on_default = (compiled_engine->caller_stream == default_stream);
+    if (caller_on_default) {
+      compiled_engine->engine_stream = compiled_engine->owned_pool_stream;
+    } else {
+      // Honor caller's non-default stream so its scheduling choice (e.g. SM
+      // partitioning via a CUDA Green Context) is preserved end to end.
+      compiled_engine->engine_stream = compiled_engine->caller_stream;
+    }
+
+    // Auto-select the optimization profile from input shapes before binding
+    // them. set_active_profile blocks until the switch has completed. Only
+    // auto-selection runs per call; manual pins are applied eagerly.
+    if (compiled_engine->num_optimization_profiles > 1 && compiled_engine->auto_select_profiles) {
+      compiled_engine->set_active_profile(compiled_engine->auto_select_profile(inputs));
+    }
     { // Input Setup
       std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
       if (compiled_engine->profile_execution) {
@@ -429,18 +460,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       create_output_allocator(compiled_engine);
     }
 
-    auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
-    auto default_stream = compiled_engine->default_stream;
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    bool caller_on_default = (compiled_engine->caller_stream == default_stream);
-    if (caller_on_default) {
-      compiled_engine->engine_stream = compiled_engine->owned_pool_stream;
-    } else {
-      // Honor caller's non-default stream so its scheduling choice (e.g. SM
-      // partitioning via a CUDA Green Context) is preserved end to end.
-      compiled_engine->engine_stream = compiled_engine->caller_stream;
-    }
-
     compiled_engine->record_active_input_tensor_stream_usage(compiled_engine->engine_stream);
 
     { // Engine Execution (execute on engine stream)

diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
@@ -40,6 +40,13 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
         .def("set_output_tensors_as_unowned", &TRTEngine::set_output_tensors_as_unowned)
         .def("are_output_tensors_unowned", &TRTEngine::are_output_tensors_unowned)
+        // Multiple optimization profiles. Names match the Python runtime
+        // (_TRTEngine.py) so both runtimes are interchangeable behind
+        // TorchTensorRTModule / the optimization_profile context manager.
+        .def("set_active_profile", &TRTEngine::set_active_profile)
+        .def_readonly("num_optimization_profiles", &TRTEngine::num_optimization_profiles)
+        .def_readonly("_active_profile_index", &TRTEngine::active_profile_index)
+        .def_readwrite("_auto_select_profiles", &TRTEngine::auto_select_profiles)
         .def(
             "use_dynamically_allocated_resources",
             [](const c10::intrusive_ptr<TRTEngine>& self, bool dynamic) -> void {

diff --git a/docsrc/tutorials/runtime_opt/index.rst b/docsrc/tutorials/runtime_opt/index.rst
@@ -2,12 +2,16 @@ Runtime Optimization
 =====================
 
 Optimize inference throughput and latency: CUDA Graphs for kernel-replay,
-pre-allocated output buffers, and choosing the Python vs C++ TRT execution path.
+pre-allocated output buffers, multiple optimization profiles for distinct shape
+regimes (e.g. LLM prefill/decode), and choosing the Python vs C++ TRT execution
+path.
 
 .. toctree::
    :maxdepth: 1
 
    cuda_graphs
    Example: Torch Export with Cudagraphs <../_rendered_examples/dynamo/torch_export_cudagraphs>
    Example: Pre-allocated output buffer <../_rendered_examples/dynamo/pre_allocated_output_example>
+   multi_optimization_profiles
+   Example: Multiple optimization profiles (prefill/decode) <../_rendered_examples/dynamo/multi_optimization_profiles>
    Python vs C++ runtime <python_runtime>