pytorch · robert-kalmar · Jun 23, 2026 · Mar 24, 2026 · Jun 10, 2026 · Jun 15, 2026
@@ -73,22 +73,28 @@ class EdgeProgramToIRConverter:
     _default_target_spec = NeutronTargetSpec("imxrt700")
     _default_delegation_options = CustomDelegationOptions()
 
+    def __init__(self):
+        self.edge_to_tflite_map = {}
+
     def convert_program(
         self,
         edge_program: ExportedProgram,
         conversion_config: ConversionConfig = _default_conversion_config,
         neutron_target_spec: NeutronTargetSpec = _default_target_spec,
         custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
-    ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]:
+    ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]:
         """
         Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes.
 
         :param edge_program: Converter ExportedProgram.
         :param conversion_config: ConversionConfig instance.
         :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param custom_delegation_options: Custom user options which affect node delegation.
-        :return: TFLite flatbuffers as bytes.
+        :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping.
         """
+        # Reset the edge to tflite map for each conversion
+        self.edge_to_tflite_map = {}
+
         parameters_mapping = self.map_inputs_to_parameters(edge_program)
         dim_order_map = self.map_nodes_to_dim_order(edge_program)
 
@@ -112,14 +118,17 @@ def convert_program(
         # Apply optimizations and finalize the model.
         internal_tflite_model = cc.tflite_builder.finish()
 
+        # Get the final edge to tflite mapping after optimization
+        self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map
+
         # Extract the formats of the model's inputs and outputs.
         io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature)
 
         # TFLite model generation
         flatbuffers_builder = flatbuffers.Builder()
         internal_tflite_model.gen_tflite(flatbuffers_builder)
 
-        return bytes(flatbuffers_builder.Output()), io_formats
+        return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map
 
     @staticmethod
     def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext):
@@ -161,7 +170,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
             exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
         ]
-
         for node in nodes:
             if node.op == "call_function":
                 if node.target in qdq_related_functions and "cluster" in node.meta:
@@ -173,7 +181,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
                     # The node was already processed alongside the Q/DQ ops.
                     pass
                 elif node.target in functions_converters:
+                    # Get TFLite op count BEFORE conversion
+                    tflite_op_count_before = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+                    # Convert the node
                     functions_converters[node.target](conversion_context).convert(node)
+                    # Get TFLite op count AFTER conversion
+                    tflite_op_count_after = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+
+                    # Track the mapping - store edge debug handle in operators.
+                    # Get the edge debug handle so it can be associated with newly created operators.
+                    edge_debug_handle = node.meta.get("debug_handle", None)
+                    if (
+                        edge_debug_handle is not None
+                        and tflite_op_count_after > tflite_op_count_before
+                    ):
+                        operators = (
+                            conversion_context.tflite_builder.get_operators().vector
+                        )
+                        # Node converters append new operators to the TFLite builder.
+                        # Only operators added during this conversion step (from "before" to "after")
+                        # are tagged with the current edge_debug_handle.
+                        for i in range(tflite_op_count_before, tflite_op_count_after):
+                            # Store edge debug handle in operator's temporary attribute
+                            operators[i].tmp_edge_debug_handle = edge_debug_handle
+                        logger.d(
+                            f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'"
+                        )
+
                 else:
                     logger.e(
                         logger.Code.NOT_IMPLEMENTED,

@@ -85,6 +85,10 @@ class ModelBuilder:
 
     conversion_config: ConversionConfig
 
+    edge_to_tflite_map: dict[
+        int, tuple[int, ...]
+    ]  # Mapping edge debug handles to tuple of TFLite operator indices
+
     _default_conversion_config = ConversionConfig()
 
     def __init__(
@@ -105,6 +109,7 @@ def __init__(
         self._nchw_tensor_version = {}
         self._skipped_output_map = {}
         self._zeros_tensor_map = {}
+        self.edge_to_tflite_map = {}
 
     def create_zeros_tensor(
         self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False
@@ -503,6 +508,9 @@ def finish(self) -> tflite_model.Model:
             self.conversion_config.optimization_blacklist,
         )
 
+        # Create the final edge-to-tflite mapping after model optimization
+        self._create_edge_to_tflite_mapping()
+
         self._keep_one_empty_buffer()
 
         # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference.
@@ -524,6 +532,29 @@ def finish(self) -> tflite_model.Model:
 
         return self._tfl_model
 
+    def _create_edge_to_tflite_mapping(self):
+        """Create edge-to-TFLite mapping and save it to the edge_to_tflite_map class variable.
+
+        This function should be called after all model optimizations have been applied to match the output TFLite model.
+        """
+
+        edge_to_tflite_dict = {}
+        for idx, op in enumerate(self.get_operators().vector):
+            if (
+                hasattr(op, "tmp_edge_debug_handle")
+                and op.tmp_edge_debug_handle is not None
+            ):
+                debug_handle = op.tmp_edge_debug_handle
+                if debug_handle not in edge_to_tflite_dict:
+                    edge_to_tflite_dict[debug_handle] = []
+                edge_to_tflite_dict[debug_handle].append(idx)
+
+        # Convert lists to tuples in the dictionary
+        self.edge_to_tflite_map = {k: tuple(v) for k, v in edge_to_tflite_dict.items()}
+        logger.i(
+            f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}"
+        )
+
     def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
         for tensor in outputs.tmp_outputs:
             try:

@@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject):
     # If `True`, this is an extra operator added during conversion. It was not present in the original input model.
     tmp_added_extra: bool
 
+    # Edge program debug handle for mapping edge nodes to TFLite operators
+    tmp_edge_debug_handle: Optional[int]
+
     def __init__(
         self,
         inputs: OperatorInputs = None,
@@ -541,6 +544,8 @@ def __init__(
         self.tmp_version = 1
         self.tmp_added_extra = False
 
+        self.tmp_edge_debug_handle = None
+
     def uses_per_channel_quantization(self) -> bool:
         """Determine if this operator uses per-channel quantization."""
         for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs):

@@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts):
     cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[
         "dumpKernelSelectionCode"
     ]
+    if (
+        hasattr(cctx.compilationOpts, "useProfiling")
+        and compilation_opts["useProfiling"]
+    ):
+        cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"]
+        cctx.compilationOpts.dumpAfterImport = "console"
+        cctx.compilationOpts.dumpAfterGenerate = "console"
+        cctx.compilationOpts.verbose = compilation_opts["useProfiling"]
+
     return cctx
 
 
@@ -81,6 +90,7 @@ def convert(
         target: str,
         delegation_tag: str,
         fetch_constants_to_sram: bool = False,
+        use_profiling: bool = False,
     ) -> bytes:
         """
         Call Neutron Converter.
@@ -89,6 +99,7 @@ def convert(
         :param target: The target platform.
         :param delegation_tag: The delegation tag of model partition.
         :param fetch_constants_to_sram: Add microcode that fetches weights from external memory.
+        :param use_profiling: Use profiling for neutron delegated model.
         This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers).
 
         :return: TFLite model with Neutron microcode as bytes.
@@ -102,6 +113,7 @@ def convert(
             "excludeGraphPasses": "HoistSliceAboveTranspose,MergeTranspose",
             "fetchConstantsToSRAM": fetch_constants_to_sram,
             "dumpKernelSelectionCode": self.dump_kernel_selection_code,
+            "useProfiling": use_profiling,
         }
 
         # Try to use multiprocessing for isolation, but fall back to direct execution