-
Notifications
You must be signed in to change notification settings - Fork 405
feat: multiple optimization profiles for disjoint input shape regimes #4325
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: abose/dynamic-shapes-passthrough
Are you sure you want to change the base?
Changes from all commits
89443d8
b781ae4
e1cff6e
896857b
a0eeae7
917aba9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -251,6 +251,10 @@ TRTEngine::TRTEngine( | |
| num_io = std::make_pair(inputs_size, outputs); | ||
| } | ||
|
|
||
| // Reconstruct optimization-profile info (count + per-profile ranges) from the | ||
| // TRT API so multi-profile selection works for any loaded engine. | ||
| this->setup_optimization_profiles(); | ||
|
|
||
| #ifndef NDEBUG | ||
| this->enable_profiling(); | ||
| #endif | ||
|
|
@@ -512,6 +516,117 @@ void TRTEngine::reset_captured_graph() { | |
| cudagraph.reset(); | ||
| } | ||
|
|
||
| // Rebuilds the cached optimization-profile metadata from the loaded engine: | ||
| //   - num_optimization_profiles: the profile count reported by the engine; | ||
| //   - is_shape_inference_io: per input binding, the result of isShapeInferenceIO; | ||
| //   - profile_dynamic_dims: per non-shape-inference input, the dims that can tell | ||
| //     profiles apart, each paired with its per-profile (min, max) range. | ||
| // With at most one profile only the first two are filled in and | ||
| // profile_dynamic_dims stays empty. Fails a TORCHTRT_CHECK if the engine | ||
| // reports an invalid or inconsistent rank for an input's profile shapes. | ||
| void TRTEngine::setup_optimization_profiles() { | ||
| num_optimization_profiles = cuda_engine->getNbOptimizationProfiles(); | ||
| profile_dynamic_dims.clear(); | ||
| is_shape_inference_io.clear(); | ||
| for (const auto& name : in_binding_names) { | ||
| is_shape_inference_io[name] = cuda_engine->isShapeInferenceIO(name.c_str()); | ||
| } | ||
| if (num_optimization_profiles <= 1) { | ||
| return; | ||
| } | ||
| for (const auto& name : in_binding_names) { | ||
| if (is_shape_inference_io[name]) { | ||
| continue; | ||
| } | ||
| // Gather [min, max] for every dim across every profile: dim -> [profile] -> (min, max). | ||
| std::vector<std::vector<std::pair<int64_t, int64_t>>> per_dim; | ||
| for (int64_t p = 0; p < num_optimization_profiles; ++p) { | ||
| const auto dmin = | ||
| cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMIN); | ||
| const auto dmax = | ||
| cuda_engine->getProfileShape(name.c_str(), static_cast<int32_t>(p), nvinfer1::OptProfileSelector::kMAX); | ||
| // TensorRT signals an unknown tensor/profile with nbDims == -1; passing that | ||
| // to resize() would wrap to a huge size, so reject it with a clear message. | ||
| TORCHTRT_CHECK( | ||
| dmin.nbDims >= 0 && dmax.nbDims == dmin.nbDims, | ||
| "Unable to query the shape range of input " << name << " for optimization profile index " << p); | ||
| if (p == 0) { | ||
| per_dim.resize(dmin.nbDims); | ||
| } | ||
| // Every profile must report the same rank; otherwise per_dim[d] would be | ||
| // written out of bounds or end up with fewer entries than there are profiles. | ||
| TORCHTRT_CHECK( | ||
| static_cast<size_t>(dmin.nbDims) == per_dim.size(), | ||
| "Input " << name << " has a different rank in optimization profile index " << p); | ||
| for (int32_t d = 0; d < dmin.nbDims; ++d) { | ||
| per_dim[d].emplace_back(dmin.d[d], dmax.d[d]); | ||
| } | ||
| } | ||
| // Keep only dims that vary within a profile (min != max) or differ across | ||
| // profiles; a dim that is the same fixed extent in every profile cannot | ||
| // distinguish profiles, so we skip it (TRT validates it at setInputShape). | ||
| auto& dynamic_dims = profile_dynamic_dims[name]; | ||
| for (int32_t d = 0; d < static_cast<int32_t>(per_dim.size()); ++d) { | ||
| const auto& ranges = per_dim[d]; | ||
| bool is_dynamic = false; | ||
| for (const auto& r : ranges) { | ||
| if (r.first != r.second || r != ranges[0]) { | ||
| is_dynamic = true; | ||
| break; | ||
| } | ||
| } | ||
| if (is_dynamic) { | ||
| dynamic_dims.push_back({d, ranges}); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| void TRTEngine::set_active_profile(int64_t profile_index) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I kind of feel like this function should take stream as argument and put it on the caller to give the right stream instead of getting the current stream in the function body |
||
| if (num_optimization_profiles <= 1) { | ||
| return; | ||
| } | ||
| if (profile_index == active_profile_index) { | ||
| return; | ||
| } | ||
| // Reached only when the engine has more than one profile and the requested index differs from the active one. The switch is issued on the current CUDA stream for device_info.id (not necessarily the default stream), and that stream is synchronized below before the new index is recorded. | ||
| auto stream = c10::cuda::getCurrentCUDAStream(device_info.id); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this work with the green context pr? |
||
| // setOptimizationProfileAsync returns false for an out-of-range index, which fails the check below; the | ||
| // index is expected to be validated upstream in TorchTensorRTModule.resolve_profile_index (not visible here). | ||
| TORCHTRT_CHECK( | ||
| exec_ctx->setOptimizationProfileAsync(static_cast<int32_t>(profile_index), stream.stream()), | ||
| "Failed to switch to optimization profile index " << profile_index); | ||
| stream.synchronize(); | ||
| active_profile_index = profile_index; | ||
| // A profile switch invalidates any captured CUDA graph and changes the | ||
| // context state, so flag the context as changed, drop the captured graph, and reset shape_key to force re-record / shape re-inference on the next call. | ||
| runtime_states.context_changed = true; | ||
| reset_captured_graph(); | ||
| shape_key = "None"; | ||
| LOG_DEBUG("Switched to optimization profile index " << profile_index); | ||
| } | ||
|
|
||
| int64_t TRTEngine::auto_select_profile(const std::vector<at::Tensor>& inputs) { | ||
| // Lazy selection: scan profiles in index order and return the index of the first one whose cached [min, max] ranges contain the size of every tracked dynamic dim of every input; inputs[i] is matched positionally to in_binding_names[i]. | ||
| // Bindings with no corresponding entry in `inputs`, shape-inference IO bindings, and bindings without cached dynamic dims are skipped, as are tracked dims at or beyond an input's rank. | ||
| // Static dims are not cached (they cannot distinguish profiles) and are validated later by TRT at setInputShape. | ||
| // Raises via TORCHTRT_THROW_ERROR when no profile fits. NOTE(review): the currently active profile is not tried first, so with overlapping profiles the lowest fitting index always wins. | ||
| for (int64_t p = 0; p < num_optimization_profiles; ++p) { | ||
| bool fits = true; | ||
| for (size_t i = 0; i < in_binding_names.size() && fits; ++i) { | ||
| const auto& name = in_binding_names[i]; | ||
| if (i >= inputs.size() || is_shape_inference_io[name]) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like this function doesnt make a ton of sense and could potentially lead to thrashing. Fundamentally in the auto mode I think first we should see if the active profile is valid for the set of inputs. The mechanism we use to map from the list to an input binding name should be identical to the code we use in execute_engine. Its fundamentally the same job. In fact we should only do it once and reuse this mapping result. There should be some map from index in the input list to a name. If the inputs fit we should short cut and return. I also think because of this the responsibility of changing the opt profile should be on these methods and not the caller. That way we can just no-op if it fits. |
||
| continue; | ||
| } | ||
| auto dims_it = profile_dynamic_dims.find(name); | ||
| if (dims_it == profile_dynamic_dims.end()) { | ||
| continue; | ||
| } | ||
| auto sizes = inputs[i].sizes(); | ||
| for (const auto& dyn : dims_it->second) { | ||
| if (dyn.dim_index < static_cast<int32_t>(sizes.size())) { | ||
| int64_t lo = dyn.profile_ranges[p].first; | ||
| int64_t hi = dyn.profile_ranges[p].second; | ||
| int64_t sz = sizes[dyn.dim_index]; | ||
| if (!(lo <= sz && sz <= hi)) { | ||
| fits = false; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| if (fits) { | ||
| return p; | ||
| } | ||
| } | ||
| TORCHTRT_THROW_ERROR( | ||
| "No optimization profile matches the input shapes. Fix the input shapes or pin a profile " | ||
| "explicitly via optimization_profile(module, index)."); | ||
| return 0; // unreachable | ||
| } | ||
|
|
||
| void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) { | ||
| if (new_strategy != this->resource_allocation_strategy) { | ||
| this->resource_allocation_strategy = new_strategy; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not do this first to short-circuit everything?