Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ TRTEngine::TRTEngine(
num_io = std::make_pair(inputs_size, outputs);
}

// Reconstruct optimization-profile info (count + per-profile ranges) from the
// TRT API so multi-profile selection works for any loaded engine.
this->setup_optimization_profiles();

#ifndef NDEBUG
this->enable_profiling();
#endif
Expand Down Expand Up @@ -512,6 +516,117 @@ void TRTEngine::reset_captured_graph() {
cudagraph.reset();
}

void TRTEngine::setup_optimization_profiles() {
num_optimization_profiles = cuda_engine->getNbOptimizationProfiles();
profile_dynamic_dims.clear();
is_shape_inference_io.clear();
for (const auto& name : in_binding_names) {
is_shape_inference_io[name] = cuda_engine->isShapeInferenceIO(name.c_str());
}
if (num_optimization_profiles <= 1) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What not do this first to short cut everything?

return;
}
for (const auto& name : in_binding_names) {
if (is_shape_inference_io[name]) {
continue;
}
// Gather [min, max] for every dim across every profile: dim -> [profile] -> (min, max).
std::vector<std::vector<std::pair<int64_t, int64_t>>> per_dim;
for (int64_t p = 0; p < num_optimization_profiles; ++p) {
auto dmin =
cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMIN);
auto dmax =
cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMAX);
if (per_dim.empty()) {
per_dim.resize(dmin.nbDims);
}
for (int d = 0; d < dmin.nbDims; ++d) {
per_dim[d].push_back(std::make_pair(dmin.d[d], dmax.d[d]));
}
}
// Keep only dims that vary within a profile (min != max) or differ across
// profiles; a dim that is the same fixed extent in every profile cannot
// distinguish profiles, so we skip it (TRT validates it at setInputShape).
auto& dynamic_dims = profile_dynamic_dims[name];
for (int32_t d = 0; d < static_cast<int32_t>(per_dim.size()); ++d) {
const auto& ranges = per_dim[d];
bool is_dynamic = false;
for (const auto& r : ranges) {
if (r.first != r.second || r != ranges[0]) {
is_dynamic = true;
break;
}
}
if (is_dynamic) {
dynamic_dims.push_back({d, ranges});
}
}
}
}

void TRTEngine::set_active_profile(int64_t profile_index) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of feel like this function should take stream as argument and put it on the caller to give the right stream instead of getting the current stream in the function body

if (num_optimization_profiles <= 1) {
return;
}
if (profile_index == active_profile_index) {
return;
}
// set the optimization on the default stream so that the enqueue stream will sync with it before running the engine
auto stream = c10::cuda::getCurrentCUDAStream(device_info.id);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work with the green context pr?

// setOptimizationProfileAsync returns false for an out-of-range index; the
// index is validated upstream in TorchTensorRTModule.resolve_profile_index.
TORCHTRT_CHECK(
exec_ctx->setOptimizationProfileAsync(static_cast<int32_t>(profile_index), stream.stream()),
"Failed to switch to optimization profile index " << profile_index);
stream.synchronize();
active_profile_index = profile_index;
// A profile switch invalidates any captured CUDA graph and changes the
// context state, so force re-record / shape re-inference on the next call.
runtime_states.context_changed = true;
reset_captured_graph();
shape_key = "None";
LOG_DEBUG("Switched to optimization profile index " << profile_index);
}

// Returns the index of the first optimization profile (scanning in index order)
// whose cached [min, max] ranges contain the dynamic dims of every provided
// input. Inputs are matched to in_binding_names by position; bindings without a
// corresponding tensor, shape-inference inputs, and inputs with no cached
// dynamic dims never reject a profile. Static dims are not cached (they cannot
// distinguish profiles) and are validated later by TRT at setInputShape.
// Raises via TORCHTRT_THROW_ERROR when no profile matches.
//
// NOTE(review): open reviewer feedback on this function — first-fit scanning
// may thrash between profiles. Suggested direction: first check whether the
// currently active profile already fits and short-circuit if so; derive the
// input-index -> binding-name mapping once and share it with execute_engine;
// and let this method own the profile switch so it can simply no-op on a fit.
int64_t TRTEngine::auto_select_profile(const std::vector<at::Tensor>& inputs) {
  // Bindings beyond the number of supplied tensors are never inspected.
  const size_t num_checked = in_binding_names.size() < inputs.size() ? in_binding_names.size() : inputs.size();

  // True when input `idx` is compatible with `profile` (or is not checked at all).
  auto input_fits = [&](size_t idx, int64_t profile) -> bool {
    const auto& binding = in_binding_names[idx];
    if (is_shape_inference_io[binding]) {
      return true;
    }
    auto entry = profile_dynamic_dims.find(binding);
    if (entry == profile_dynamic_dims.end()) {
      return true;
    }
    auto shape = inputs[idx].sizes();
    const auto rank = static_cast<int32_t>(shape.size());
    for (const auto& dim : entry->second) {
      if (dim.dim_index >= rank) {
        // The tensor has no such dim; leave it for TRT to validate.
        continue;
      }
      const auto& range = dim.profile_ranges[profile];
      const int64_t extent = shape[dim.dim_index];
      if (extent < range.first || extent > range.second) {
        return false;
      }
    }
    return true;
  };

  for (int64_t profile = 0; profile < num_optimization_profiles; ++profile) {
    size_t idx = 0;
    while (idx < num_checked && input_fits(idx, profile)) {
      ++idx;
    }
    if (idx == num_checked) {
      return profile;
    }
  }

  TORCHTRT_THROW_ERROR(
      "No optimization profile matches the input shapes. Fix the input shapes or pin a profile "
      "explicitly via optimization_profile(module, index).");
  return 0; // unreachable
}

void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
if (new_strategy != this->resource_allocation_strategy) {
this->resource_allocation_strategy = new_strategy;
Expand Down
35 changes: 35 additions & 0 deletions core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,41 @@ struct TRTEngine : torch::CustomClassHolder {
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;

// --- Multiple optimization profiles ---
// State and helpers mirror the Python runtime (TRTEngine in _TRTEngine.py) so
// the C++ and Python runtimes are interchangeable: the same attribute and
// method names are exposed via torchbind in register_jit_hooks.cpp
// (``num_optimization_profiles``, ``_active_profile_index``,
// ``_auto_select_profiles``, ``set_active_profile``). Index validation lives
// in the runtime-agnostic TorchTensorRTModule.resolve_profile_index.
int64_t num_optimization_profiles = 1; // cuda_engine->getNbOptimizationProfiles()
int64_t active_profile_index = 0; // profile currently loaded in exec_ctx
bool auto_select_profiles = false; // opt-in shape-based selection (per call)
// A single input dimension whose extent varies across or within optimization
// profiles, paired with its [min, max] range for each profile index. Dims that
// are a fixed identical extent in every profile are NOT stored, so selection
// only inspects dims that can actually distinguish profiles.
struct DynamicProfileDim {
  // Index of the dimension within the input's shape. Defaulted so a
  // default-constructed instance never carries an indeterminate value.
  int32_t dim_index = 0;
  // (min, max) extent of this dimension for each profile; indexed by profile index.
  std::vector<std::pair<int64_t, int64_t>> profile_ranges;
};
// input name -> only its dynamic dims (static dims are omitted). Cached from
// the TRT API.
std::unordered_map<std::string, std::vector<DynamicProfileDim>> profile_dynamic_dims;
std::unordered_map<std::string, bool> is_shape_inference_io;

// Cache profile count + per-profile dim ranges purely from the TRT API
// (getNbOptimizationProfiles / getProfileShape) so selection works for any
// loaded engine with no extra serialized metadata.
void setup_optimization_profiles();
// Switch the active TRT optimization profile (idempotent).
void set_active_profile(int64_t profile_index);
// Lazy / first-working: first profile whose [min, max] fits all input shapes.
// Called internally from the execute_engine run paths (guarded by
// num_optimization_profiles > 1 && auto_select_profiles); manual pins are
// applied eagerly via set_active_profile.
int64_t auto_select_profile(const std::vector<at::Tensor>& inputs);

// Single placeholder buffer for empty tensor inputs (allocated once, reused)
void* empty_tensor_placeholder = nullptr;

Expand Down
47 changes: 33 additions & 14 deletions core/runtime/execute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

auto run_standard_execution = [&]() {
bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
bool shape_changed = _validate_shapes(inputs, compiled_engine);

// Resolve the execution stream first so the optimization-profile switch below
// is issued on the same stream the engine will actually enqueue on. This keeps
// the switch consistent with execution and, for a default-stream caller, keeps
// it on the pool stream instead of serializing the device on the default stream.
auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
auto default_stream = compiled_engine->default_stream;
auto previous_engine_stream = compiled_engine->engine_stream;
Expand All @@ -260,6 +262,15 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->runtime_states.context_changed = true;
}

// Auto-select the optimization profile from input shapes before validating
// shapes, so a profile switch's context_changed flag and shape_key reset are
// observed below. The switch runs on engine_stream (resolved above). Only
// auto-selection runs per call; manual pins are applied eagerly.
if (compiled_engine->num_optimization_profiles > 1 && compiled_engine->auto_select_profiles) {
compiled_engine->set_active_profile(compiled_engine->auto_select_profile(inputs));
}
bool shape_changed = _validate_shapes(inputs, compiled_engine);

// Whether cudagraphs needs to record the graph on this pass
auto result = compiled_engine->runtime_states.set_runtime_states(
cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
Expand Down Expand Up @@ -401,6 +412,26 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
};

auto run_output_allocator = [&]() {
// Resolve the execution stream first so the optimization-profile switch below
// is issued on the same stream the engine will actually enqueue on.
auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
auto default_stream = compiled_engine->default_stream;
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
bool caller_on_default = (compiled_engine->caller_stream == default_stream);
if (caller_on_default) {
compiled_engine->engine_stream = compiled_engine->owned_pool_stream;
} else {
// Honor caller's non-default stream so its scheduling choice (e.g. SM
// partitioning via a CUDA Green Context) is preserved end to end.
compiled_engine->engine_stream = compiled_engine->caller_stream;
}

// Auto-select the optimization profile from input shapes before binding
// them. The switch runs on engine_stream (resolved above). Only
// auto-selection runs per call; manual pins are applied eagerly.
if (compiled_engine->num_optimization_profiles > 1 && compiled_engine->auto_select_profiles) {
compiled_engine->set_active_profile(compiled_engine->auto_select_profile(inputs));
}
{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
Expand Down Expand Up @@ -429,18 +460,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
create_output_allocator(compiled_engine);
}

auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device();
auto default_stream = compiled_engine->default_stream;
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
bool caller_on_default = (compiled_engine->caller_stream == default_stream);
if (caller_on_default) {
compiled_engine->engine_stream = compiled_engine->owned_pool_stream;
} else {
// Honor caller's non-default stream so its scheduling choice (e.g. SM
// partitioning via a CUDA Green Context) is preserved end to end.
compiled_engine->engine_stream = compiled_engine->caller_stream;
}

compiled_engine->record_active_input_tensor_stream_usage(compiled_engine->engine_stream);

{ // Engine Execution (execute on engine stream)
Expand Down
7 changes: 7 additions & 0 deletions core/runtime/register_jit_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
.def("reset_captured_graph", &TRTEngine::reset_captured_graph)
.def("set_output_tensors_as_unowned", &TRTEngine::set_output_tensors_as_unowned)
.def("are_output_tensors_unowned", &TRTEngine::are_output_tensors_unowned)
// Multiple optimization profiles. Names match the Python runtime
// (_TRTEngine.py) so both runtimes are interchangeable behind
// TorchTensorRTModule / the optimization_profile context manager.
.def("set_active_profile", &TRTEngine::set_active_profile)
.def_readonly("num_optimization_profiles", &TRTEngine::num_optimization_profiles)
.def_readonly("_active_profile_index", &TRTEngine::active_profile_index)
.def_readwrite("_auto_select_profiles", &TRTEngine::auto_select_profiles)
.def(
"use_dynamically_allocated_resources",
[](const c10::intrusive_ptr<TRTEngine>& self, bool dynamic) -> void {
Expand Down
6 changes: 5 additions & 1 deletion docsrc/tutorials/runtime_opt/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ Runtime Optimization
=====================

Optimize inference throughput and latency: CUDA Graphs for kernel-replay,
pre-allocated output buffers, and choosing the Python vs C++ TRT execution path.
pre-allocated output buffers, multiple optimization profiles for distinct shape
regimes (e.g. LLM prefill/decode), and choosing the Python vs C++ TRT execution
path.

.. toctree::
:maxdepth: 1

cuda_graphs
Example: Torch Export with Cudagraphs <../_rendered_examples/dynamo/torch_export_cudagraphs>
Example: Pre-allocated output buffer <../_rendered_examples/dynamo/pre_allocated_output_example>
multi_optimization_profiles
Example: Multiple optimization profiles (prefill/decode) <../_rendered_examples/dynamo/multi_optimization_profiles>
Python vs C++ runtime <python_runtime>
Loading
Loading