diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index 6ec8ee80688..d9b1d9cff43 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -73,13 +73,16 @@ class EdgeProgramToIRConverter:
     _default_target_spec = NeutronTargetSpec("imxrt700")
     _default_delegation_options = CustomDelegationOptions()
 
+    def __init__(self):
+        self.edge_to_tflite_map = {}
+
     def convert_program(
         self,
         edge_program: ExportedProgram,
         conversion_config: ConversionConfig = _default_conversion_config,
         neutron_target_spec: NeutronTargetSpec = _default_target_spec,
         custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
-    ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]:
+    ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]:
         """
         Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes.
 
@@ -87,8 +90,11 @@ def convert_program(
         :param conversion_config: ConversionConfig instance.
         :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param custom_delegation_options: Custom user options which affect node delegation.
-        :return: TFLite flatbuffers as bytes.
+        :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping.
         """
+        # Reset the edge to tflite map for each conversion
+        self.edge_to_tflite_map = {}
+
         parameters_mapping = self.map_inputs_to_parameters(edge_program)
         dim_order_map = self.map_nodes_to_dim_order(edge_program)
 
@@ -112,6 +118,9 @@ def convert_program(
         # Apply optimizations and finalize the model.
         internal_tflite_model = cc.tflite_builder.finish()
 
+        # Get the final edge to tflite mapping after optimization
+        self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map
+
         # Extract the formats of the model's inputs and outputs.
         io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature)
 
@@ -119,7 +128,7 @@ def convert_program(
         flatbuffers_builder = flatbuffers.Builder()
         internal_tflite_model.gen_tflite(flatbuffers_builder)
 
-        return bytes(flatbuffers_builder.Output()), io_formats
+        return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map
 
     @staticmethod
     def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext):
@@ -161,7 +170,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
             exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
         ]
-
         for node in nodes:
             if node.op == "call_function":
                 if node.target in qdq_related_functions and "cluster" in node.meta:
@@ -173,7 +181,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
                     # The node was already processed alongside the Q/DQ ops.
                     pass
                 elif node.target in functions_converters:
+                    # Get TFLite op count BEFORE conversion
+                    tflite_op_count_before = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+                    # Convert the node
                     functions_converters[node.target](conversion_context).convert(node)
+                    # Get TFLite op count AFTER conversion
+                    tflite_op_count_after = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+
+                    # Track the mapping - store edge debug handle in operators.
+                    # Get the edge debug handle so it can be associated with newly created operators.
+                    edge_debug_handle = node.meta.get("debug_handle", None)
+                    if (
+                        edge_debug_handle is not None
+                        and tflite_op_count_after > tflite_op_count_before
+                    ):
+                        operators = (
+                            conversion_context.tflite_builder.get_operators().vector
+                        )
+                        # Node converters append new operators to the TFLite builder.
+                        # Only operators added during this conversion step (from "before" to "after")
+                        # are tagged with the current edge_debug_handle.
+                        for i in range(tflite_op_count_before, tflite_op_count_after):
+                            # Store edge debug handle in operator's temporary attribute
+                            operators[i].tmp_edge_debug_handle = edge_debug_handle
+                        logger.d(
+                            f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'"
+                        )
+
                 else:
                     logger.e(
                         logger.Code.NOT_IMPLEMENTED,
diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py
index f97a194ce87..41820c3ab61 100755
--- a/backends/nxp/backend/ir/converter/builder/model_builder.py
+++ b/backends/nxp/backend/ir/converter/builder/model_builder.py
@@ -85,6 +85,10 @@ class ModelBuilder:
 
     conversion_config: ConversionConfig
 
+    edge_to_tflite_map: dict[
+        int, tuple[int, ...]
+    ]  # Mapping edge debug handles to tuple of TFLite operator indices
+
     _default_conversion_config = ConversionConfig()
 
     def __init__(
@@ -105,6 +109,7 @@ def __init__(
         self._nchw_tensor_version = {}
         self._skipped_output_map = {}
         self._zeros_tensor_map = {}
+        self.edge_to_tflite_map = {}
 
     def create_zeros_tensor(
         self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False
@@ -503,6 +508,9 @@ def finish(self) -> tflite_model.Model:
             self.conversion_config.optimization_blacklist,
         )
 
+        # Create the final edge-to-tflite mapping after model optimization
+        self._create_edge_to_tflite_mapping()
+
         self._keep_one_empty_buffer()
 
         # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference.
@@ -524,6 +532,29 @@ def finish(self) -> tflite_model.Model:
 
         return self._tfl_model
 
+    def _create_edge_to_tflite_mapping(self):
+        """Build the edge-to-TFLite mapping and store it in the `edge_to_tflite_map` instance attribute.
+
+        Call this only after all model optimizations have been applied, so the indices match the output TFLite model.
+        """
+
+        handle_to_indices = {}
+        for op_index, operator in enumerate(self.get_operators().vector):
+            # Operators that were never tagged (attribute missing or None) are left out of the mapping.
+            handle = getattr(operator, "tmp_edge_debug_handle", None)
+            if handle is None:
+                continue
+            handle_to_indices.setdefault(handle, []).append(op_index)
+
+        # Freeze the collected index lists into tuples.
+        self.edge_to_tflite_map = {
+            handle: tuple(indices) for handle, indices in handle_to_indices.items()
+        }
+        # Report the final mapping.
+        logger.i(
+            f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}"
+        )
+
     def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
         for tensor in outputs.tmp_outputs:
             try:
diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py
index 6e8e7b6c33b..d8d0bada57d 100755
--- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py
+++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py
@@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject):
     # If `True`, this is an extra operator added during conversion. It was not present in the original input model.
     tmp_added_extra: bool
 
+    # Edge program debug handle for mapping edge nodes to TFLite operators
+    tmp_edge_debug_handle: Optional[int]
+
     def __init__(
         self,
         inputs: OperatorInputs = None,
@@ -541,6 +544,8 @@ def __init__(
         self.tmp_version = 1
         self.tmp_added_extra = False
 
+        self.tmp_edge_debug_handle = None
+
     def uses_per_channel_quantization(self) -> bool:
         """Determine if this operator uses per-channel quantization."""
         for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs):
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
index 0abee0cdc86..92b4e25a5de 100644
--- a/backends/nxp/backend/neutron_converter_manager.py
+++ b/backends/nxp/backend/neutron_converter_manager.py
@@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts):
     cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[
         "dumpKernelSelectionCode"
     ]
+    if (
+        hasattr(cctx.compilationOpts, "useProfiling")
+        and compilation_opts["useProfiling"]
+    ):
+        cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"]
+        cctx.compilationOpts.dumpAfterImport = "console"
+        cctx.compilationOpts.dumpAfterGenerate = "console"
+        cctx.compilationOpts.verbose = compilation_opts["useProfiling"]
+
     return cctx
 
 
@@ -81,6 +90,7 @@ def convert(
         target: str,
         delegation_tag: str,
         fetch_constants_to_sram: bool = False,
+        use_profiling: bool = False,
     ) -> bytes:
         """
         Call Neutron Converter.
@@ -89,6 +99,7 @@ def convert(
         :param target: The target platform.
         :param delegation_tag: The delegation tag of model partition.
         :param fetch_constants_to_sram: Add microcode that fetches weights from external memory.
         This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers).
+        :param use_profiling: Use profiling for neutron delegated model.
 
         :return: TFLite model with Neutron microcode as bytes.
@@ -102,6 +113,7 @@ def convert(
             "excludeGraphPasses": "HoistSliceAboveTranspose,MergeTranspose",
             "fetchConstantsToSRAM": fetch_constants_to_sram,
             "dumpKernelSelectionCode": self.dump_kernel_selection_code,
+            "useProfiling": use_profiling,
         }
 
         # Try to use multiprocessing for isolation, but fall back to direct execution
diff --git a/backends/nxp/backend/neutron_map.py b/backends/nxp/backend/neutron_map.py
new file mode 100644
index 00000000000..e2da653daa3
--- /dev/null
+++ b/backends/nxp/backend/neutron_map.py
@@ -0,0 +1,457 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import re
+from dataclasses import dataclass
+
+# example:  Type: CONV_2D
+#               Inputs:
+#                 [0]: quantized_decomposed_quantize_per_tensor_default_4
+#                 [1]: quantized_decomposed_dequantize_per_channel_default_2
+#               Outputs:
+#                 [0]: quantized_decomposed_quantize_per_tensor_default_5
+#               Location: 4
+PATTERN_NODE = (
+    r"Type:\s+(?P<type>\w+)\s+"
+    r"Inputs:(?P<inputs>[\s\S]*?)"
+    r"Outputs:(?P<outputs>[\s\S]*?)"
+    r"Location:\s+(?P<location>\d+)"
+)
+# Same layout as PATTERN_NODE, but starts at the subgraph number and ends at the "Tensors:" section.
+PATTERN_SUBGRAPH = (
+    r"^(?P<num>\d+)\s*"
+    r"Inputs:(?P<inputs>[\s\S]*?)"
+    r"Outputs:(?P<outputs>[\s\S]*?)"
+    r"Tensors:"
+)
+# example:  [0]: quantized_decomposed_quantize_per_tensor_default_4
+PATTERN_IO_TENSOR_NAME = r"\[\d+\]:\s+(?P<name>[\S]+)"
+# example: Statistics for NeutronGraph "subgraph_195":
+PATTERN_GRAPH = r"Statistics for NeutronGraph \"subgraph_(?P<num>\d+)\":"
+# example:      NeutronOperator "subgraph_001":
+#                       Operators:
+#                           PAD
+#                           CONV_2D
+#                       Kernels:
+#                           Pad
+#                           Conv2DStandardV2
+#               NeutronOperator "subgraph_002":
+PATTERN_VERBOSE_KERNELS = (
+    r"\"subgraph_(?P<subgraph>\d+)\"\:\s*"
+    r"Operators:[\s\S]*?"
+    r"Kernels:\s*(?P<kernels>[\s\S]*?)"
+    r"\s*(NeutronOperator|^$|=)"
+)
+# example:  NeutronGraph "subgraph_074":
+PATTERN_VERBOSE_GRAPH = (
+    r"NeutronGraph\s*\"subgraph_(?P<subgraph>\d+)\":(?P<operators>[\s\S]*?)\s*(^$|=)"
+)
+# Two graphs are expected in the input log: original and converted.
+EXPECTED_GRAPHS = 2
+# Single-input operator types; parallel nodes of these types must not be mapped onto the same TFLite node.
+SINGLE_INPUT_NODES = [
+    "ABS",
+    "AVERAGE_POOL_2D",
+    "CAST",
+    "EXP",
+    "HARD_SWISH",
+    "LEAKY_RELU",
+    "LOG",
+    "LOGISTIC",
+    "MAX_POOL_2D",
+    "QUANTIZE",
+    "RSQRT",
+    "TANH",
+]
+
+
+@dataclass
+class Node:
+    name: str  # Operator type as dumped by the converter (e.g. "CONV_2D").
+    inputs: list[str]  # Names of the node's input tensors.
+    outputs: list[str]  # Names of the node's output tensors.
+    location: int  # Location in graph/subgraph.
+
+
+@dataclass
+class SubgraphInfo:
+    num: int  # Subgraph number.
+    location: int  # Location in the final Neutron graph; -1 until it is resolved.
+    inputs: list[str]  # Names of the subgraph's input tensors.
+    outputs: list[str]  # Names of the subgraph's output tensors.
+    kernels: int  # Number of Neutron kernels in the Neutron subgraph.
+    nodes: list[Node]  # TFLite nodes contained in the Neutron subgraph.
+
+
+def get_tensors_name(tensors: str) -> list[str]:
+    """Return the tensor names listed as "[idx]: name" entries in the input string, in order of appearance."""
+    return re.findall(PATTERN_IO_TENSOR_NAME, tensors)
+
+
+class NeutronMap:
+    """Mapping between Neutron, TFLite, and Edge operators based on the Neutron converter log.
+
+    Parses the Neutron converter log to extract information about TFLite nodes and Neutron subgraphs.
+    Maps TFLite operators to corresponding Neutron operators.
+    Maps Edge operators to Neutron operators via the Edge-to-TFLite mapping.
+
+    Attributes:
+        tflite_nodes (list[Node]): TFLite node information extracted from the converter log.
+        neutron_subgraphs (list[SubgraphInfo]): Neutron subgraph information extracted from the converter log.
+        neutron_graphs (list[int]): Indices of final Neutron graphs derived from neutron_subgraphs.
+        edge_to_tflite_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to TFLite operators.
+        edge_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to Neutron operators.
+        tflite_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from TFLite operators to Neutron operators.
+
+    Example:
+        >>> neutron_map = NeutronMap(log_output, edge_to_tflite_map)
+        >>> neutron_to_edge_map = neutron_map.get_neutron_to_edge_map()
+    """
+
+    tflite_nodes: list[Node]
+    neutron_subgraphs: list[SubgraphInfo]
+    neutron_graphs: list[int]
+    edge_to_tflite_map: dict[int, tuple[int, ...]]
+    edge_to_neutron_map: dict[int, tuple[int, ...]]
+    tflite_to_neutron_map: dict[int, tuple[int, ...]]
+
+    def __init__(
+        self, neutron_converter_log: str, edge_to_tflite_map: dict[int, tuple[int, ...]]
+    ) -> None:
+        """Initialize neutron map from neutron converter log.
+
+        :param neutron_converter_log: Neutron converter log obtained during model conversion. It should contain the
+        original TFLite graph and Neutron graph dumps (dumpAfterImport and dumpAfterGenerate set to "console").
+        :param edge_to_tflite_map: Mapping from Edge debug handles to tuples of TFLite operator indices.
+        """
+        super().__init__()
+        self.tflite_nodes = []
+        self.neutron_subgraphs = []
+        self.neutron_graphs = []
+        self.edge_to_tflite_map = edge_to_tflite_map
+        self.tflite_to_neutron_map = {}
+        self.edge_to_neutron_map = {}
+        self.neutron_kernels_num = 0  # Total kernel count, accumulated while parsing the log.
+        self._split_profiling_log(neutron_converter_log)
+
+    def _split_profiling_log(self, log: str) -> None:
+        """Process profiling log to split it into original TFLite and converted Neutron nodes.
+
+        :param log: Neutron converter log obtained during model conversion, containing the original
+            TFLite graph and Neutron graph dump.
+        :return: None. Sets the instance attributes tflite_nodes and neutron_subgraphs with node information.
+        """
+        graphs = log.split("Graphs:")
+        # Return without parsing anything unless the dump contains exactly two "Graphs:" sections.
+        if len(graphs) != EXPECTED_GRAPHS + 1:
+            return
+        optimization_dump, neutron_graph_dump = graphs[1:]
+
+        # Get tflite model dump
+        tflite_graph_dump = optimization_dump.partition("= Optimize Graph =")[0]
+
+        # Get verbose Neutron graphs located in the Extract Graphs section.
+        extracted_graph_dump = optimization_dump.partition("= Extract Graphs =")[
+            2
+        ].partition("Generate code for NeutronGraph")[0]
+
+        # Get list of original operators from first dumped graph.
+        self.tflite_nodes = [
+            Node(
+                matched_operator.group("type"),
+                get_tensors_name(matched_operator.group("inputs")),
+                get_tensors_name(matched_operator.group("outputs")),
+                int(matched_operator.group("location")),
+            )
+            for matched_operator in re.finditer(PATTERN_NODE, tflite_graph_dump)
+        ]
+        # Get list of neutron subgraphs.
+        self.neutron_subgraphs = self._get_neutron_subgraphs(neutron_graph_dump)
+        if self.neutron_subgraphs:
+            self._update_neutron_subgraphs_info(extracted_graph_dump)
+
+    def _get_neutron_subgraphs(self, graph_dump: str) -> list[SubgraphInfo]:
+        """Parse Neutron graph dump and extract subgraph information.
+
+        :param graph_dump: String containing the Neutron graph dump from the converter log.
+        :return: List of SubgraphInfo objects containing subgraph metadata and operator nodes.
+        """
+
+        def get_subgraph_nodes(subgraph_dump: str) -> list[Node]:
+            """Parse subgraph dump and extract operator nodes.
+
+            :param subgraph_dump: String containing a single Neutron subgraph definition.
+            :return: List of Node objects representing operators in the subgraph.
+            """
+            return [
+                Node(
+                    matched_operator.group("type"),
+                    get_tensors_name(matched_operator.group("inputs")),
+                    get_tensors_name(matched_operator.group("outputs")),
+                    int(matched_operator.group("location")),
+                )
+                for matched_operator in re.finditer(PATTERN_NODE, subgraph_dump)
+            ]
+
+        subgraphs = graph_dump.split(r"Name: subgraph_")
+        if len(subgraphs) < 3:
+            return []
+
+        # Get numbers of final neutron graphs from the statistics that follow the last subgraph.
+        self.neutron_graphs = [
+            int(matched_graphs.group("num"))
+            for matched_graphs in re.finditer(PATTERN_GRAPH, subgraphs[-1])
+        ]
+        if not self.neutron_graphs:
+            return []
+
+        # Get subgraphs; location (-1) and kernel count (0) are placeholders at this point.
+        neutron_subgraphs: list[SubgraphInfo] = []
+        for subgraph in subgraphs[1:]:
+            subgraph_match = re.search(PATTERN_SUBGRAPH, subgraph)
+            if not subgraph_match:
+                continue
+            neutron_subgraph = SubgraphInfo(
+                int(subgraph_match.group("num")),
+                -1,
+                get_tensors_name(subgraph_match.group("inputs")),
+                get_tensors_name(subgraph_match.group("outputs")),
+                0,
+                get_subgraph_nodes(subgraph),
+            )
+            neutron_subgraphs.append(neutron_subgraph)
+        return neutron_subgraphs
+
+    def _update_neutron_subgraphs_info(self, extracted_graph: str) -> None:
+        """Update Neutron subgraphs with verbose info.
+
+        - Set numbers of Neutron kernels in each Neutron subgraph. 99% of subgraphs contain only one Neutron kernel,
+        but there are some exceptions and some subgraphs can have more kernels. This number can be taken from
+        final Neutron graph info.
+        - Set Neutron subgraphs location in the final Neutron Graph. The function updates the location parameter
+        for each Neutron subgraph according to its position in the final Neutron graph. Location is calculated
+        continuously across all Neutron graphs in the model. Non-Neutron operators are skipped.
+
+        :param extracted_graph: verbose Neutron graph dump.
+        """
+        # Split the dump into one chunk per Neutron graph.
+        neutron_graphs = extracted_graph.split("NeutronGraph")
+        location_shift = 0
+        for neutron_graph in neutron_graphs:
+
+            subgraph_nodes = {
+                int(matched_subgraph.group("subgraph")): {
+                    "location": i + location_shift,
+                    "kernels": [
+                        kernel.replace(" ", "")
+                        for kernel in matched_subgraph.group("kernels").split("\n")
+                        if kernel.strip()
+                    ],
+                }
+                for i, matched_subgraph in enumerate(
+                    re.finditer(PATTERN_VERBOSE_KERNELS, neutron_graph)
+                )
+            }
+            if not subgraph_nodes:
+                continue
+            # Shift the location offset by the number of subgraphs matched in this chunk.
+            location_shift += len(subgraph_nodes)
+
+            # Number of the Neutron graph itself: the first subgraph id found in this chunk.
+            graph_num = -1
+            matched_graph = re.search(r"subgraph_(?P<subgraph>\d+)", neutron_graph)
+            if matched_graph:
+                graph_num = int(matched_graph.group("subgraph"))
+
+            # Update kernel count and location of the matched subgraphs; sum the kernels for the graph entry.
+            for subgraph in self.neutron_subgraphs:
+                if subgraph.num in subgraph_nodes:
+                    subgraph.kernels = len(subgraph_nodes[subgraph.num]["kernels"])
+                    subgraph.location = subgraph_nodes[subgraph.num]["location"]
+                elif subgraph.num == graph_num:
+                    subgraph.kernels = sum(
+                        len(s["kernels"]) for s in subgraph_nodes.values()
+                    )
+                    self.neutron_kernels_num += subgraph.kernels
+
+    def _nodes_match_by_io(self, tf_node: Node, neutron_node: Node) -> bool:
+        """
+        Determine whether a TFLite node can be mapped to a Neutron node
+        based on their input and output compatibility.
+
+        :param tf_node: Source TFLite node.
+        :param neutron_node: Target Neutron node.
+        :return: True if the nodes can be considered mapped, False otherwise.
+        """
+
+        def get_name_matches(tf_names: list[str], neutron_names: list[str]) -> int:
+            # Count how many names from tf_names have a corresponding match in
+            # neutron_names. A match is defined as:
+            #   - exact equality, or
+            #   - one name being a hierarchical variant of the other
+            #     (i.e., one name followed by "/" occurs inside the other).
+            result = 0
+            for tf_name in tf_names:
+                # Determine if the tensor name corresponds to a special operation input.
+                # Matches names like "perm", "perm0", "perm1", etc. used by Transpose ops,
+                # and names like "padding", "paddings0", "padding1", etc. used by Pad ops.
+                special_op = (
+                    "permutation"
+                    if re.fullmatch(r"perm(\d+)?", tf_name)
+                    else (
+                        "padding"
+                        if re.fullmatch(r"padding(s)?(\d+)?", tf_name)
+                        else None
+                    )
+                )
+                for neutron_name in neutron_names:
+                    if (
+                        neutron_name == tf_name
+                        or neutron_name + "/" in tf_name
+                        or tf_name + "/" in neutron_name
+                    ):
+                        result += 1
+                        break
+
+                    # A special TFLite input also matches any Neutron name containing the special op keyword.
+                    if special_op and special_op in neutron_name:
+                        result += 1
+                        break
+            return result
+
+        name_matches = get_name_matches(tf_node.inputs, neutron_node.inputs)
+        # Map the node if all TFLite inputs match Neutron inputs.
+        # Note: the Neutron node may still have additional extra inputs.
+        if name_matches == len(tf_node.inputs):
+            return True
+        elif name_matches == len(tf_node.inputs) - 1:
+            # If there is only one unmatched input, check matching of outputs.
+            name_matches = get_name_matches(tf_node.outputs, neutron_node.outputs)
+            if name_matches == len(tf_node.outputs):
+                # Map the node if all TFLite outputs match Neutron outputs.
+                return True
+        return False
+
+    def get_tflite_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
+        """Map TFLite nodes from the original model to Neutron nodes in the converted model.
+
+        The mapping is built based on input and output tensor names. Neutron tensors may have
+        exactly the same names or use the format "tflite_input/additional_name".
+        Subgraphs listed in neutron_graphs are skipped; a location is recorded at most once per TFLite node.
+        :return: Dictionary mapping TFLite node indices to tuple of Neutron subgraph locations.
+        """
+        tflite_to_neutron_dict = {}
+        for tf_idx, tf_node in enumerate(self.tflite_nodes):
+            subgraph_idxs = []
+            for subgraph in self.neutron_subgraphs:
+                if (
+                    subgraph.num in self.neutron_graphs
+                    or subgraph.location in subgraph_idxs
+                ):
+                    continue
+                for neutron_node in subgraph.nodes:
+                    if self._nodes_match_by_io(tf_node, neutron_node):
+                        subgraph_idxs.append(subgraph.location)
+                        break
+            # Filter subgraph_idxs to avoid mapping multiple parallel single-input nodes that consume the
+            # same input tensor into the same TFLite node.
+            subgraph_idxs = self._filter_single_input_nodes(tf_node.name, subgraph_idxs)
+            if subgraph_idxs:
+                tflite_to_neutron_dict[tf_idx] = tuple(subgraph_idxs)
+
+        self.tflite_to_neutron_map = tflite_to_neutron_dict
+        return self.tflite_to_neutron_map
+
+    def _filter_single_input_nodes(
+        self, node_name: str, subgraph_loc: list[int]
+    ) -> list[int]:
+        """
+        Filter the TFLite-to-Neutron mapping to avoid mapping multiple parallel single-input nodes
+        that consume the same input tensor to a single TFLite node.
+
+        The function checks whether the current TFLite node is a supported single-input node
+        (as defined in SINGLE_INPUT_NODES) and whether it is mapped to multiple Neutron nodes.
+        In such cases, it is possible that parallel single-input Neutron nodes were incorrectly
+        mapped to the same TFLite node.
+
+        If any single-input Neutron node is among the candidates, only the first one whose operation name
+        matches the operation name of the current TFLite node is kept; if none matches, nothing is mapped.
+
+        :param node_name: Operation name of the current TFLite node.
+        :param subgraph_loc: List of Neutron subgraph locations whose inputs correspond to the
+                            input of the current TFLite node.
+        :return: Filtered list of Neutron subgraph locations to be mapped to the current TFLite node.
+        """
+        # Check if there can be potential issue in mapping.
+        if node_name in SINGLE_INPUT_NODES and len(subgraph_loc) > 1:
+            single_in_nodes = []
+            # Find all single-input nodes in the subgraphs referenced by subgraph_loc.
+            subgraphs = (
+                subgraph
+                for subgraph in self.neutron_subgraphs
+                if subgraph.location in subgraph_loc
+            )
+            for subgraph in subgraphs:
+                for neutron_node in subgraph.nodes:
+                    if neutron_node.name in SINGLE_INPUT_NODES:
+                        single_in_nodes.append((subgraph.location, neutron_node.name))
+            if len(single_in_nodes) > 0:
+                # Keep only the first node with the matching name when single-input nodes are present in subgraph_loc.
+                for subgraph_id, single_in_node_name in single_in_nodes:
+                    if single_in_node_name == node_name:
+                        return [subgraph_id]
+                return []
+        return subgraph_loc
+
+    def get_edge_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
+        """Map Edge nodes to Neutron nodes; Edge handles without any Neutron counterpart are omitted.
+
+        :return: Dictionary mapping Edge node handles to sorted tuple of Neutron subgraph indices.
+        """
+        self.get_tflite_to_neutron_map()
+        edge_to_neutron_dict = {}
+
+        for edge_handle, tflite_indices in self.edge_to_tflite_map.items():
+            neutron_nodes = set()
+            for tf_node in tflite_indices:
+                # TFLite nodes without a Neutron counterpart contribute nothing.
+                neutron_nodes.update(self.tflite_to_neutron_map.get(tf_node, ()))
+            if neutron_nodes:
+                # Sort so the tuple order does not depend on set iteration order.
+                edge_to_neutron_dict[edge_handle] = tuple(sorted(neutron_nodes))
+        self.edge_to_neutron_map = edge_to_neutron_dict
+        return self.edge_to_neutron_map
+
+    def get_neutron_to_edge_map(self) -> dict[int, tuple[int, ...]]:
+        """
+        Invert the edge-to-neutron map into a neutron-to-edge map.
+
+        The result has an entry for every index from 0 to neutron_kernels_num (inclusive);
+        indices without any mapped Edge handle get an empty tuple.
+
+        :return: Dictionary mapping neutron_index to tuple of edge_handles, or an empty
+            dictionary when no Edge node is mapped to a Neutron node.
+        """
+        # Build the edge-to-neutron map first if it has not been populated yet.
+        if not self.edge_to_neutron_map:
+            self.get_edge_to_neutron_map()
+
+        handles_by_neutron_idx = {}
+        for handle, indices in self.edge_to_neutron_map.items():
+            for idx in indices:
+                handles_by_neutron_idx.setdefault(idx, []).append(handle)
+
+        # Nothing mapped: report an empty dictionary.
+        if not handles_by_neutron_idx:
+            return {}
+
+        # One extra, non-mapped entry is added at the end for the Neutron Dump event.
+        index_count = self.neutron_kernels_num + 1
+        result = {
+            i: tuple(handles_by_neutron_idx.get(i, ()))
+            for i in range(index_count)
+        }
+        logging.info(f"Neutron to Edge map was created: {result}")
+        return result
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index 1a84a418e92..ee711c34369 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -11,6 +11,8 @@
 import logging
 import os
 import struct
+import tempfile
+from contextlib import contextmanager
 from typing import final
 
 import numpy as np
@@ -26,6 +28,8 @@
 from executorch.backends.nxp.backend.neutron_converter_manager import (
     NeutronConverterManager,
 )
+
+from executorch.backends.nxp.backend.neutron_map import NeutronMap
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.neutron_node_extraction import (
     extract_artifacts_from_neutron_node,
@@ -54,6 +58,7 @@ def __init__(self):
         self.use_neutron_for_format_conversion = True
         self.fetch_constants_to_sram = False
         self.dump_kernel_selection_code = False
+        self.use_profiling = False
 
     def _replace_colons(self, operator: str) -> str:
         """
@@ -70,6 +75,7 @@ def neutron_compile_spec(
         use_neutron_for_format_conversion: bool = True,
         fetch_constants_to_sram: bool = False,
         dump_kernel_selection_code: bool = False,
+        use_profiling: bool = False,
     ) -> "NeutronCompileSpecBuilder":
         """Generate compile spec for Neutron NPU
 
@@ -83,6 +89,7 @@ def neutron_compile_spec(
         :param fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights
                                      from FLASH to SRAM. This should be used when the whole model does not fit into SRAM.
         :param dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code.
+        :param use_profiling: If True, the Neutron Converter enables profiling for the Neutron-delegated model.
         :return: self for method chaining
         """
 
@@ -106,6 +113,7 @@ def neutron_compile_spec(
         self.use_neutron_for_format_conversion = use_neutron_for_format_conversion
         self.fetch_constants_to_sram = fetch_constants_to_sram
         self.dump_kernel_selection_code = dump_kernel_selection_code
+        self.use_profiling = use_profiling
 
         return self
 
@@ -135,6 +143,10 @@ def build(self):
                     "dump_kernel_selection_code",
                     f"{self.dump_kernel_selection_code}".encode(),
                 ),
+                CompileSpec(
+                    "use_profiling",
+                    f"{self.use_profiling}".encode(),
+                ),
             ]
 
         return self.compile_spec
@@ -149,6 +161,7 @@ def generate_neutron_compile_spec(
     use_neutron_for_format_conversion: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
+    use_profiling: bool = False,
 ) -> list[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
@@ -160,11 +173,36 @@ def generate_neutron_compile_spec(
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
             fetch_constants_to_sram=fetch_constants_to_sram,
             dump_kernel_selection_code=dump_kernel_selection_code,
+            use_profiling=use_profiling,
         )
         .build()
     )
 
 
+@contextmanager
+def capture_fd_output():
+    """Redirect fd 1 (stdout) and fd 2 (stderr) into a temporary file and yield it.
+
+    The redirection is done on the file descriptors themselves. The temporary file
+    is closed when the context exits, so it must be read inside the `with` block.
+    """
+    with tempfile.TemporaryFile() as tmp:
+        # Save original stdout / stderr
+        original_stdout_fd = os.dup(1)
+        original_stderr_fd = os.dup(2)
+        try:
+            # Redirect fd=1 and fd=2 to temp file
+            os.dup2(tmp.fileno(), 1)
+            os.dup2(tmp.fileno(), 2)
+            yield tmp  # give access to the temp file
+        finally:
+            # Restore original fds, then release the saved duplicates.
+            os.dup2(original_stdout_fd, 1)
+            os.dup2(original_stderr_fd, 2)
+            os.close(original_stdout_fd)
+            os.close(original_stderr_fd)
+
+
 @final
 class NeutronBackend(BackendDetails):
 
@@ -185,6 +223,7 @@ def preprocess(  # noqa C901
         use_neutron_for_format_conversion = None
         fetch_constants_to_sram = False
         dump_kernel_selection_code = None
+        use_profiling = False
         for spec in compile_spec:
             if spec.key == "output_format":
                 output_format = spec.value.decode()
@@ -200,6 +239,8 @@ def preprocess(  # noqa C901
                 fetch_constants_to_sram = spec.value.decode() == "True"
             if spec.key == "dump_kernel_selection_code":
                 dump_kernel_selection_code = spec.value.decode() == "True"
+            if spec.key == "use_profiling":
+                use_profiling = spec.value.decode() == "True"
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -229,19 +270,32 @@ def preprocess(  # noqa C901
                 if use_neutron_for_format_conversion is not None
                 else {}
             )
-            tflite_model, io_formats = EdgeProgramToIRConverter().convert_program(
+            (
+                tflite_model,
+                io_formats,
+                edge_to_tflite_map,
+            ) = EdgeProgramToIRConverter().convert_program(
                 edge_program,
                 neutron_target_spec=NeutronTargetSpec(target),
                 conversion_config=conversion_config,
                 custom_delegation_options=CustomDelegationOptions(),
             )
 
-            neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert(
-                tflite_model,
-                target,
-                delegation_tag,
-                fetch_constants_to_sram,
-            )
+            with capture_fd_output() as tmp:
+                neutron_model = NeutronConverterManager(
+                    dump_kernel_selection_code
+                ).convert(
+                    tflite_model,
+                    target,
+                    delegation_tag,
+                    fetch_constants_to_sram,
+                    use_profiling,
+                )
+                tmp.seek(0)
+                log_output = tmp.read().decode()
+            # Get mapping from tflite to neutron
+            map = NeutronMap(log_output, edge_to_tflite_map)
+            neutron_to_edge_map = map.get_neutron_to_edge_map()
 
             # Dump the tflite file if intermediates_dir is set
             if intermediates_dir != "None":
@@ -265,7 +319,9 @@ def preprocess(  # noqa C901
         else:
             raise RuntimeError(f"Unknown format {output_format}")
 
-        return PreprocessResult(processed_bytes=binary)
+        return PreprocessResult(
+            processed_bytes=binary, debug_handle_map=neutron_to_edge_map
+        )
 
 
 class PayloadComposer:
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
index 3ea973b7c5b..6fe0482ed89 100644
--- a/backends/nxp/runtime/NeutronBackend.cpp
+++ b/backends/nxp/runtime/NeutronBackend.cpp
@@ -10,6 +10,7 @@
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
 
 #include "NeutronDriver.h"
@@ -25,6 +26,8 @@ namespace neutron {
 #define ALIGN_SIZE(size) \
   ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1)))
 
+#define KOPC_CALLARGS 6 // The operation for TileIR
+
 // clang-format off
 /* Header schema:
      +----------------------------+-----------------------------+------------------------+
@@ -84,6 +87,19 @@ typedef struct {
   const uint8_t* outputMap;
 } NeutronExecutorchConfig;
 
+typedef struct { // One record of the Neutron profiling output.
+  uint8_t eventCode;
+  uint8_t opCode; // Compared with KOPC_CALLARGS to pick the mappable events.
+  uint8_t functionCode; // Logged as 1-byte metadata of the delegate event.
+  uint8_t timestampCode;
+  uint32_t time; // Passed to the event tracer as start/end time.
+} NeutronSingleProfilingEvent;
+
+typedef struct { // Start/stop pair; the profile buffer is cast to these.
+  NeutronSingleProfilingEvent startEvent;
+  NeutronSingleProfilingEvent stopEvent;
+} NeutronFullProfilingEvent;
+
 #ifdef EXTERNAL_MEM
 // Neutron compute has no access to FLASH.
 // Prefetch weights from FLASH to SRAM using memcpy.
@@ -508,12 +524,11 @@ class NeutronBackend final : public PyTorchBackendInterface {
       }
     }
 
-#ifdef NEUTRON_PROFILE
-    // TODO: Use trace from BackendExecutionContext.
-    NeutronTraceConfig trace_config{.traceConfig = 0};
-    neutronSetTrace(cfg->nmh, &trace_config);
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Save ticks before neutron compute to measure how much time profiling dump
+    // takes
+    et_timestamp_t start_ticks = ::executorch::runtime::pal_current_ticks();
 #endif
-
     // Run neutron compute.
     NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg);
     if (neutronRC != ENONE) {
@@ -523,6 +538,11 @@ class NeutronBackend final : public PyTorchBackendInterface {
           neutronRC);
       return Error::InvalidProgram;
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Save ticks after neutron compute to measure how much time profiling dump
+    // takes
+    et_timestamp_t stop_ticks = ::executorch::runtime::pal_current_ticks();
+#endif
 
     // Transpose outputs.
     for (int i = 0; i < cfg->numOutputs; i++) {
@@ -558,6 +578,53 @@ class NeutronBackend final : public PyTorchBackendInterface {
         }
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Add traced events only if model has profiling info.
+    auto profile_size = cfg->profileSize;
+    if (profile_size > 0) {
+      int events_num = static_cast<int>(profile_size / 16);
+      auto profiling_index = cfg->numOutputs + 1;
+      char* profile_info =
+          static_cast<char*>(cfg->dcfg.outputs[profiling_index]);
+      NeutronFullProfilingEvent* neutron_events =
+          (NeutronFullProfilingEvent*)profile_info;
+      executorch::runtime::EventTracer* tracer = context.event_tracer();
+      uint32_t start_time = 0;
+      int index = 0;
+      // Post log neutron events from profiling output.
+      for (int i = 0; i < events_num; i++) {
+        if (start_time == 0) {
+          start_time = neutron_events[i].startEvent.time;
+        }
+        if (neutron_events[i].stopEvent.opCode != KOPC_CALLARGS) {
+          // Only KOPC_CALLARGS events can be mapped to original .pte model.
+          continue;
+        } else {
+          event_tracer_log_profiling_delegate(
+              tracer,
+              nullptr,
+              index,
+              start_time,
+              neutron_events[i].stopEvent.time,
+              static_cast<const void*>(
+                  &neutron_events[i].startEvent.functionCode),
+              sizeof(uint8_t));
+          start_time = 0;
+          index++;
+        }
+      }
+      event_tracer_log_profiling_delegate(
+          tracer,
+          nullptr,
+          index,
+          neutron_events[events_num - 1].startEvent.time,
+          neutron_events[events_num - 1].stopEvent.time + stop_ticks -
+              start_ticks,
+          static_cast<const void*>(
+              &neutron_events[events_num - 1].startEvent.functionCode),
+          sizeof(uint8_t));
+    }
+#endif
 
     return Error::Ok;
   }
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
index 44a96010593..1309e019428 100644
--- a/backends/nxp/tests/executorch_pipeline.py
+++ b/backends/nxp/tests/executorch_pipeline.py
@@ -190,6 +190,7 @@ def to_quantized_edge_program(
     use_quant_state_dict: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
+    use_profiling: bool = False,
     delegate_to_npu=True,
 ) -> EdgeProgramManager:
     _neutron_target_spec = NeutronTargetSpec(target)
@@ -223,6 +224,7 @@ def to_quantized_edge_program(
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         fetch_constants_to_sram=fetch_constants_to_sram,
         dump_kernel_selection_code=dump_kernel_selection_code,
+        use_profiling=use_profiling,
     )
     post_quant_state_dict = (
         exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None
@@ -244,6 +246,7 @@ def to_quantized_edge_program(
         export(exir_program_aten__module_quant, example_input, strict=True),
         transform_passes=NeutronEdgePassManager(),
         partitioner=partitioners,
+        generate_etrecord=use_profiling,
         compile_config=EdgeCompileConfig(
             _check_ir_validity=False,
             _core_aten_ops_exception_list=core_aten_ops_exception_list,
@@ -274,6 +277,7 @@ def to_quantized_executorch_program(
     use_neutron_for_format_conversion: bool = True,
     dataset_dir: str | None = None,
     delegate_to_npu=True,
+    use_profiling: bool = False,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ) -> ExecutorchProgramManager:
@@ -295,6 +299,7 @@ def to_quantized_executorch_program(
         train_fn=train_fn,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         delegate_to_npu=delegate_to_npu,
+        use_profiling=use_profiling,
         operators_not_to_delegate=operators_not_to_delegate,
         remove_quant_io_ops=remove_quant_io_ops,
         **get_calibration_inputs_fn,
diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py
index 319f372b5fa..94e91a31b95 100644
--- a/backends/nxp/tests/executors.py
+++ b/backends/nxp/tests/executors.py
@@ -325,7 +325,7 @@ def convert_run_compare(
 
     if tfl_model is None:
         NodeFormatInference(edge_program).identify_node_formats()
-        tfl_model, _ = EdgeProgramToIRConverter().convert_program(
+        tfl_model, *_ = EdgeProgramToIRConverter().convert_program(
             edge_program, conversion_config
         )
 
diff --git a/backends/nxp/tests/generic_tests/test_aot_example.py b/backends/nxp/tests/generic_tests/test_aot_example.py
index 893041fe372..8a1e5e49555 100644
--- a/backends/nxp/tests/generic_tests/test_aot_example.py
+++ b/backends/nxp/tests/generic_tests/test_aot_example.py
@@ -2,11 +2,13 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
+import os
 import subprocess
 import sys
 from pathlib import Path
 
+from executorch.backends.nxp.tests.config_importer import test_config
+
 # noinspection PyProtectedMember
 from executorch.exir._serialize import _deserialize_pte_binary
 from executorch.exir.schema import DelegateCall, KernelCall
@@ -15,9 +17,8 @@
 def test_aot_example__mobilenet_v2():
     """Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated."""
 
-    # Find the executorch root directory (5 levels up from this test file)
-    executorch_root = Path(__file__).parent.parent.parent.parent.parent
-    assert executorch_root.exists(), f"Executorch root not found at {executorch_root}"
+    # Set the executorch root directory.
+    executorch_root = test_config.PROJECT_DIR
 
     # Run the compilation script as a module (like run_aot_example.sh does)
     cmd = [
@@ -34,14 +35,14 @@ def test_aot_example__mobilenet_v2():
     ]
 
     # Output file will be created in executorch_root
-    pte_file = executorch_root / "mobilenetv2_nxp_delegate.pte"
+    pte_file = Path(os.path.join(executorch_root, "mobilenetv2_nxp_delegate.pte"))
 
     try:
         result = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
-            timeout=300,  # 5 minute timeout just in case. On my machine, the test usually runs ~1 minute.
+            timeout=300,  # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute.
             cwd=str(
                 executorch_root
             ),  # Run from executorch root (like run_aot_example.sh)
@@ -95,3 +96,77 @@ def test_aot_example__mobilenet_v2():
         # Clean up the generated file
         if pte_file.exists():
             pte_file.unlink()
+
+
+def test_aot_example__mobilenet_v2__profiling():
+    """Run `aot_neutron_compile.py` on mobilenet_v2 with profiling enabled.
+
+    Checks the exit code, the PTE file, the logged Neutron to Edge map and the ETRecord.
+    """
+    # Set the executorch root directory.
+    executorch_root = test_config.PROJECT_DIR
+
+    # Output files will be created in executorch_root.
+    pte_file = Path(executorch_root) / "mobilenetv2_nxp_delegate_profile.pte"
+    etrecord_file = Path(executorch_root) / "etrecord" / "mobilenetv2_etrecord.bin"
+
+    # Run the compilation script as a module (like run_aot_example.sh does)
+    script_args = [
+        "--model_name",
+        "mobilenetv2",
+        "--delegate",
+        "--quantize",
+        "--target",
+        "imxrt700",
+        "--remove-quant-io-ops",
+        "--use_channels_last_dim_order",
+        "--use_profiling",  # Generate profilable model and create ETRecord
+        "--use_random_dataset",  # Avoid downloading the dataset.
+    ]
+    compile_command = [
+        sys.executable,
+        "-m",
+        "examples.nxp.aot_neutron_compile",
+        *script_args,
+    ]
+
+    try:
+        result = subprocess.run(
+            compile_command,
+            capture_output=True,
+            text=True,
+            timeout=300,  # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute.
+            cwd=str(
+                executorch_root
+            ),  # Run from executorch root (like run_aot_example.sh)
+        )
+
+        # Check script ran successfully.
+        assert result.returncode == 0, (
+            f"Script failed with return code {result.returncode}\n"
+            f"STDOUT:\n{result.stdout}\n"
+            f"STDERR:\n{result.stderr}"
+        )
+
+        # Check if delegated model was created and saved.
+        assert pte_file.exists(), f"PTE file not created at {pte_file}"
+
+        # Combine stdout and stderr to capture all subprocess output, including logs.
+        combined_output = result.stdout + result.stderr
+
+        # Check if nonempty Neutron to Edge map was created.
+        assert "Neutron to Edge map was created:" in combined_output
+
+        # Check if ETRecord was created and saved.
+        assert "The ETRecord for the model was saved to" in combined_output
+        assert etrecord_file.exists(), f"ETRecord file not created at {etrecord_file}"
+
+    finally:
+        # Clean up the generated files.
+        if pte_file.exists():
+            pte_file.unlink()
+        if etrecord_file.exists():
+            etrecord_file.unlink()
+            # Remove the ETRecord directory as well, but only if nothing is left in it.
+            if next(etrecord_file.parent.iterdir(), None) is None:
+                etrecord_file.parent.rmdir()
diff --git a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
index 27bd675a487..6aa07dbba8d 100644
--- a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
+++ b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
@@ -629,7 +629,7 @@ def test_move_activation_before_concat_quantization__conv(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -668,7 +668,7 @@ def test_move_activation_before_concat_quantization__linear(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -706,7 +706,7 @@ def test_move_activation_before_concat_quantization__addmm(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -744,7 +744,7 @@ def test_move_activation_before_concat_quantization__mm(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -788,9 +788,7 @@ def test_concat_cluster_quantization__conv(
                     "lowered_module" in node.name for node in edge_program.graph.nodes
                 )
 
-                tflite_flatbuffers_model, io_formats = converter_spy.calls[
-                    -1
-                ].return_value
+                tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
                 exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
                 exir_program_aten_quant: GraphModule = quantizer_spy.calls[
                     -1
@@ -861,9 +859,7 @@ def test_concat_cluster_quantization__linear(
                     "lowered_module" in node.name for node in edge_program.graph.nodes
                 )
 
-                tflite_flatbuffers_model, io_formats = converter_spy.calls[
-                    -1
-                ].return_value
+                tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
                 exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
                 exir_program_aten_quant: GraphModule = quantizer_spy.calls[
                     -1
diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
index 8cf7dfe3dc2..52654a482b9 100644
--- a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
+++ b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
@@ -37,7 +37,7 @@ def test_lowered_program_and_tflite_output_match__conv2d__no_bias(mocker):
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     tflite_model = Model.GetRootAs(tflite_flatbuffers_model)
     sub_graph = tflite_model.Subgraphs(0)
@@ -84,7 +84,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker):
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
     # Capture generated model
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # No Transpose ops in produced TFLite model
     tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0)
@@ -148,7 +148,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker):
     )
 
     # Capture the converted IR model.
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Make sure the `Transpose` ops ARE in the IR model.
     tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0)
diff --git a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
index 0705203db06..359dfdb67e9 100644
--- a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
+++ b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
@@ -28,7 +28,7 @@ def test_conv2d_neutron_conversion():
 
     NodeFormatInference(edge_program_manager.exported_program()).identify_node_formats()
     edge_program_converter = EdgeProgramToIRConverter()
-    tflite_model, _ = edge_program_converter.convert_program(
+    tflite_model, *_ = edge_program_converter.convert_program(
         edge_program_manager.exported_program()
     )
 
diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
index 706d8ed3e14..af9ef08057b 100644
--- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
+++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
@@ -153,7 +153,7 @@ def test_per_channel_convolution(self, _, use_qat: bool):
                 use_neutron_for_format_conversion=False,
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
diff --git a/backends/nxp/tests/generic_tests/test_profiling.py b/backends/nxp/tests/generic_tests/test_profiling.py
new file mode 100644
index 00000000000..c922eb070c3
--- /dev/null
+++ b/backends/nxp/tests/generic_tests/test_profiling.py
@@ -0,0 +1,158 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import ast
+import logging
+import re
+
+import numpy as np
+import pytest
+import torch
+from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    NumericalStatsOutputComparator,
+)
+
+from executorch.backends.nxp.tests.models import AvgPool2dModule, SoftmaxModule
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+
+from executorch.examples.nxp.experimental.cifar_net.cifar_net import CifarNetModel
+
+
+@pytest.fixture(autouse=True)
+def reseed_model_per_test_run():  # Runs before every test: fixed torch/numpy seeds.
+    torch.manual_seed(23)
+    np.random.seed(23)
+
+
+PATTERN_NEUTRON_MAP = r"Neutron to Edge map was created: (\{.*\})"
+
+
+def extract_map_from_logs(caplog):
+    """Return the first Neutron-to-Edge map found in the captured logs, else None."""
+    messages = (record.getMessage() for record in caplog.records)
+    found = (re.search(PATTERN_NEUTRON_MAP, message) for message in messages)
+    for neutron_map_match in found:
+        if neutron_map_match:
+            return ast.literal_eval(neutron_map_match.group(1))
+    return None
+
+
+class ParallelPoolModel(torch.nn.Module):
+    """Conv2d -> parallel 2x2 max/avg pooling -> channel concat -> 1x1 Conv2d."""
+
+    def __init__(self, channels: int):
+        super().__init__()
+        self.conv_in = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.max_pool2d = torch.nn.MaxPool2d(kernel_size=2, stride=2)
+        self.avg_pool2d = torch.nn.AvgPool2d(kernel_size=2, stride=2)
+        self.conv_out = torch.nn.Conv2d(2 * channels, channels, kernel_size=1)
+
+    def forward(self, x):
+        y = self.conv_in(x)
+        return self.conv_out(torch.cat((self.max_pool2d(y), self.avg_pool2d(y)), dim=1))
+
+
+class TestProfiling:  # Each test checks the logged Neutron-to-Edge map.
+    @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True)
+    def test__softmax(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        model = SoftmaxModule(-1)
+        lower_run_compare(
+            model,
+            (10,),
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            use_profiling=True,
+            output_comparator=NumericalStatsOutputComparator(),
+        )
+
+        # Neutron map for 1D Softmax with input size 10 should contain 4 nodes:
+        # 3 Neutron kernels (pad, softmax, and slice) and 1 unmapped node used for profiling dump
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (2,),  # Pad
+            1: (2,),  # Softmax
+            2: (2,),  # Slice
+            3: (),  # Neutron Dump
+        }
+
+    def test__parallel_pool(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (1, 3, 32, 32)
+        model = ParallelPoolModel(input_shape[1])
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (6,),  # Conv2DStandardV2
+            1: (),  # Conv2DDepthwiseV2 (AvgPool)
+            2: (7,),  # MaxPool
+            3: (),  # TransposeCHW
+            4: (),  # TransposeCHW
+            5: (),  # TransposeCHW
+            6: (),  # Slice
+            7: (),  # Pad
+            8: (),  # Conv2DPointwise
+            9: (),  # Slice
+            10: (),  # Neutron Dump
+        }
+
+    @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True)
+    def test__cifar(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (1, 3, 32, 32)
+        model = CifarNetModel()
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (10,),  # Pad
+            1: (10, 11),  # Conv2DStandardV1 (Pad + Conv2d)
+            2: (12,),  # MaxPool
+            3: (13, 14),  # Conv2DStandardV1 (Pad + Conv2d)
+            4: (15,),  # MaxPool
+            5: (16, 17),  # Conv2DStandardV1 (Pad + Conv2d)
+            6: (18,),  # MaxPool
+            7: (20,),  # FullyConnected
+            8: (21,),  # Pad
+            9: (21,),  # Softmax
+            10: (21,),  # Slice
+            11: (),  # Neutron Dump
+        }
+
+    def test__avg_pool(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (2, 9, 6, 15)
+        model = AvgPool2dModule(False, 0)
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (2,),  # Pad
+            1: (2,),  # Conv2DDepthwiseDense
+            2: (2,),  # Slice
+            3: (),  # Neutron Dump
+        }
diff --git a/backends/nxp/tests/generic_tests/test_quantizer.py b/backends/nxp/tests/generic_tests/test_quantizer.py
index 3c23241e01e..6180d2fd9ae 100644
--- a/backends/nxp/tests/generic_tests/test_quantizer.py
+++ b/backends/nxp/tests/generic_tests/test_quantizer.py
@@ -432,7 +432,7 @@ def test_quantizer__linear_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -477,7 +477,7 @@ def test_quantizer__addmm_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -522,7 +522,7 @@ def test_quantizer__mm_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -567,7 +567,7 @@ def test_quantizer__conv_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
index a8cdee41830..668deb28c96 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
@@ -51,7 +51,7 @@ def test_addmm_conversion(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -84,7 +84,7 @@ def test_linear_conversion__with_bias(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
index dc442a4931c..466f596bf91 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
@@ -59,7 +59,7 @@ def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat)
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data_1 = (
         np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
index b4b828cd4e6..5ee3db6752f 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
@@ -182,7 +182,7 @@ def test_conv_dropout_quant(
                 use_neutron_for_format_conversion=False,
             ).exported_program()
 
-            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             assert not graph_contains_any(
@@ -241,7 +241,7 @@ def test_clone_pool_view_copy_quant(
                 use_neutron_for_format_conversion=False,
             ).exported_program()
 
-            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             assert not graph_contains_any(
@@ -311,7 +311,7 @@ def test_clone__to_contiguous_format(self):
         ).identify_node_formats()
 
         # Convert to the IR.
-        converted_model, _ = EdgeProgramToIRConverter().convert_program(
+        converted_model, *_ = EdgeProgramToIRConverter().convert_program(
             edge_program_manager.exported_program()
         )
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
index 828647d2113..7105514514a 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
@@ -177,7 +177,7 @@ def test_conv2d_quant_conversion(mocker, model: torch.nn.Module, input_shape, us
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
@@ -367,7 +367,7 @@ def test_conv_transpose2d_conversion__quantized(
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
index 60dbfd1b215..79fffff3b78 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
@@ -51,7 +51,7 @@ def test_mm_conversion(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -85,7 +85,7 @@ def test_linear_conversion__without_bias(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
index e0fc0d85066..2e7f9035e8a 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
@@ -74,7 +74,7 @@ def test_convert_neg(mocker, input_shape):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data = (
         np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0
@@ -105,7 +105,7 @@ def test_convert_neg__channels_last(mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data = (
         np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
index fb25f02785a..c5c7aa55b03 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
@@ -50,7 +50,7 @@ def test_prelu_with_linear_quant_conversion(mocker, input_shape):
     ).exported_program()
 
     # Capture generated entities
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
     # Check `prelu` was not decomposed into simpler edge operators
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
index 2621baf18ee..00c10bd257d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
@@ -85,7 +85,7 @@ def test_softmax_delegation(input_shape, dim: int, mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     input_data = random_input_data(input_shape)
 
     # Make sure the tested program contains the `softmax`, and its input has the expected rank.
@@ -121,7 +121,7 @@ def test_softmax_delegation__channel_first(input_shape, dim: int, mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     input_data = random_input_data(input_shape)
 
     # Make sure the tested program contains the `softmax`.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
index cb5f398fa21..276b29da142 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
@@ -265,7 +265,7 @@ def test_view_copy_w_linear_quant_conversion(mocker, input_shape, new_shape, use
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     edge_program: ExportedProgram = converter_spy.call_args.args[1]
@@ -299,7 +299,7 @@ def test_view_w_conv_linear_quant_conversion(
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     edge_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
index 88ea567381f..aadef8c7731 100644
--- a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
+++ b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
@@ -251,7 +251,7 @@ def test_linear_bn_full_qat_pipeline_conversion(
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
     # Capture generated model
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py
index d5ff3680f38..ef6fe9c864c 100644
--- a/backends/nxp/tests/nsys_testing.py
+++ b/backends/nxp/tests/nsys_testing.py
@@ -101,6 +101,8 @@ def _run_delegated_executorch_program(
     mocker,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
+    use_profiling: bool = False,
+    use_neutron_for_format_conversion=True,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ) -> tuple[ExportedProgram, str]:
@@ -129,6 +131,8 @@ def wrapper(*args, **kwargs):
             delegate_to_npu=True,
             use_qat=use_qat,
             train_fn=train_fn,
+            use_profiling=use_profiling,
+            use_neutron_for_format_conversion=use_neutron_for_format_conversion,
             operators_not_to_delegate=operators_not_to_delegate,
             remove_quant_io_ops=remove_quant_io_ops,
         )
@@ -405,6 +409,8 @@ def lower_run_compare(
     reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
+    use_profiling: bool = False,
+    use_neutron_for_format_conversion=True,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ):
@@ -424,6 +430,10 @@ def lower_run_compare(
     :param reference_model: Version of the model which will be run to obtain reference output data.
     :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training).
     :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`.
+    :param use_profiling: Enable profiling for neutron delegated model.
+    :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
+                                                ensure that the IO matches the executorch partition, which will be
+                                                delegated to Neutron.
     :param operators_not_to_delegate: list of operators not to delegate.
     :param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized
         version of dataset (quantized INT8 input samples).
@@ -468,6 +478,8 @@ def lower_run_compare(
         mocker,
         use_qat=use_qat,
         train_fn=train_fn,
+        use_profiling=use_profiling,
+        use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         operators_not_to_delegate=operators_not_to_delegate,
         remove_quant_io_ops=remove_quant_io_ops,
     )
diff --git a/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png
new file mode 100644
index 00000000000..50ed49f57ec
Binary files /dev/null and b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png differ
diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md
index 22499aea7ad..b8739046351 100644
--- a/docs/source/backends/nxp/nxp-overview.md
+++ b/docs/source/backends/nxp/nxp-overview.md
@@ -64,6 +64,8 @@ here https://www.nxp.com/design/design-center/software/eiq-ai-development-enviro
 
 **→{doc}`nxp-kernel-selection` — Neutron Firmware Kernel Selection support.**
 
+**→{doc}`nxp-profiling` — Neutron models profiling.**
+
 ```{toctree}
 :maxdepth: 2
 :hidden:
@@ -74,4 +76,5 @@ nxp-quantization
 tutorials/nxp-tutorials
 nxp-dim-order
 nxp-kernel-selection
+nxp-profiling
 ```
diff --git a/docs/source/backends/nxp/nxp-profiling.md b/docs/source/backends/nxp/nxp-profiling.md
new file mode 100644
index 00000000000..17e352e479d
--- /dev/null
+++ b/docs/source/backends/nxp/nxp-profiling.md
@@ -0,0 +1,205 @@
+# NXP eIQ Profiling Support
+
+
+The eIQ Neutron Backend is integrated with the
+[Developer Tools](https://docs.pytorch.org/executorch/stable/delegate-debugging.html)
+to provide visibility into delegated operator execution time.
+
+There are three steps required to obtain profiling results for an NXP‑delegated model:
+
+* Convert the model with profiling support enabled.
+* Generate the artifacts consumed by the Developer Tools (`ETRecord`, `ETDump`).
+* Create and run the Inspector class to consume these artifacts and print the results.
+
+---
+
+## Convert a model with the profiling support
+
+Profiling data is generated only for a **profilable** model. 
+To convert a model with profiling enabled, the `--use_profiling` flag must be set.
+
+See the `aot_neutron_compile.py` example and its
+[README](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md)
+for additional details.
+
+The following command creates a profilable `cifar10_nxp_delegate_profile.pte` model and the corresponding `ETRecord` for the 
+**i.MX RT700** board:
+
+```bash
+python -m examples.nxp.aot_neutron_compile --quantize \
+    --delegate -m cifar10 \
+    --use_profiling
+```
+
+For installation details, see {doc}`nxp-overview`.
+
+---
+
+## Generate ETRecord (Optional)
+
+`ETRecord` is an optional artifact that contains model graphs and metadata used to link runtime profiling results 
+back to the eager model.
+
+The recommended approach is to enable `ETRecord` generation by passing `generate_etrecord=True` to export API calls.
+After export completes, retrieve the `ETRecord` using the `get_etrecord()` method, and save it using the `save()` method:
+
+### Example
+
+```python
+from executorch.devtools.etrecord import generate_etrecord
+
+# 1. Open a model and export the model to ATEN
+model = model.eval()
+exported_program = torch.export.export(model, example_inputs, strict=True)
+module = exported_program.module()
+
+# 2. Transform and lower
+compile_spec = generate_neutron_compile_spec("imxrt700")
+partitioners = (
+    [
+        NeutronPartitioner(
+            compile_spec,
+            NeutronTargetSpec(target="imxrt700"),
+            post_quantization_state_dict=module.state_dict(),
+        )
+    ]
+)
+edge_program_manager = to_edge_transform_and_lower(
+    export(module, example_inputs, strict=True),
+    transform_passes=NeutronEdgePassManager(),
+    generate_etrecord=True,
+    partitioner=partitioners,
+    compile_config=EdgeCompileConfig(
+        _core_aten_ops_exception_list=core_aten_ops_exception_list,
+    ),
+)
+
+# 3. Export to ExecuTorch program
+exec_prog = edge_program_manager.to_executorch(
+    config=ExecutorchBackendConfig(extract_delegate_segments=False)
+)
+# Save ETRecord
+exec_prog.get_etrecord().save("etrecord.bin")
+
+```
+
+### Complete Example
+
+A full implementation is available
+in [aot_neutron_compile.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/aot_neutron_compile.py).
+
+The `--use_profiling` flag is used to create a **profilable** model and the corresponding `ETRecord` file  
+(see [Convert a model with the profiling support](#convert-a-model-with-the-profiling-support) for the full command).
+
+
+---
+
+## Generate ETDump
+
+
+The next step is to generate an `ETDump`. An `ETDump` contains runtime data collected during model inference execution.
+
+To generate an `ETDump`, ensure that the ExecuTorch runtime library is integrated with the Developer Tools and built 
+with the `ET_EVENT_TRACER_ENABLED` flag enabled.
+
+Only models converted with profiling support will produce an `ETDump` containing execution times for all Neutron 
+operators. Otherwise, the dump will include only the final delegate execution time.
+
+Neutron software provides a profiling mechanism that logs individual operator execution times to a dedicated runtime 
+output. This data is then used to generate post‑time events after the inference has completed.
+
+
+### Example
+
+```c
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+```
+```c
+// 1. Create ETDumpGen BEFORE inference.
+auto etdump_gen_ptr = std::make_unique<executorch::etdump::ETDumpGen>();
+executorch::etdump::ETDumpGen* etdump_gen = etdump_gen_ptr.get();
+
+// 2. Load a method from the program by name with ETDump generator for profiling.
+Result<Method> method = program->load_method(method_name, &memory_manager, etdump_gen);
+
+// 3. Input tensor setup.
+Tensor::SizesType sizes[] = {1, 1, 32, 32};
+Tensor::DimOrderType dim_order[] = {0, 2, 3, 1};
+TensorImpl impl(ScalarType::Float, 4, sizes, image_data, dim_order);
+Tensor tensor(&impl);
+Error status = method->set_input(tensor, 0);
+
+// 4. Execute.
+status = method->execute();
+
+// Get ETDump.
+if (etdump_gen != nullptr) {
+    executorch::etdump::ETDumpResult result = etdump_gen->get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+        PRINTF("Add a breakpoint here and run this command in Debugger Console: "
+    	       "dump binary memory trace.etdump result.buf (result.buf + result.size)\r\n");
+    }
+}
+```
+
+
+To save an `ETDump` file from the board to a PC, use the **Debug Console** in the MCUXpresso IDE:
+
+- Set a breakpoint at the `PRINTF(...)` line in the example above.
+- Enter the following command in the Debug Console and press **Enter**:
+
+  ```
+  dump binary memory trace.etdump result.buf (result.buf + result.size)
+  ```
+
+
+<figure style="border:1px solid #ccc; padding:8px; display:inline-block;">
+  <img src="../../_static/img/nxp/nxp-mcuxpresso-etdump.png" width="500" alt="Save ETDump in MCUXpresso project" />
+  <figcaption>
+        <b>Figure 1:</b> Save ETDump in MCUXpresso Project.
+  </figcaption>
+</figure>
+
+
+The resulting `ETDump` file is generated in the project folder within the MCUXpresso workspace.
+
+> **Note:**  
+> Profilable models print profiling data to the terminal. Generating this dump may take longer than executing the 
+> Neutron kernels themselves, but this overhead can be ignored as it affects only models with profiling support 
+> enabled. The dump generation time is included in the `ETDump` as the final kernel entry.
+
+---
+
+## Creating an Inspector
+
+The [Inspector](https://docs.pytorch.org/executorch/1.0/model-inspector.html) APIs provide a way to analyze the 
+contents of `ETRecord` and `ETDump`, enabling developers to gain insights into model architecture 
+and performance statistics.
+
+`ETRecord` is an optional argument used to obtain a mapping between the original model and the converted Neutron model.
+
+An `ETDump` generated on the board contains metadata for each Neutron operator, including its unique identifier.  
+To visualize this metadata in the Inspector results table, set the `include_delegate_debug_data = True` argument.
+
+### Example
+
+```python
+from executorch.devtools import Inspector
+
+inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin")
+inspector.print_data_tabular(include_delegate_debug_data = True)
+```
+
+### Complete Example
+
+A full implementation is available
+in [analyzing_with_inspector.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/analyzing_with_inspector.py). @lint-ignore
+
+---
+
+## Summary
+
+* Build the model with the `--use_profiling` flag enabled.
+* Build the ExecuTorch runtime library with the `ET_EVENT_TRACER_ENABLED` flag and the ETDump Developer Tool.
+* Use the Debug Console in MCUXpresso to save the `ETDump` file from the board to a PC.
+* Visualize the profiling results using the Inspector.
diff --git a/examples/nxp/analyzing_with_inspector.py b/examples/nxp/analyzing_with_inspector.py
new file mode 100644
index 00000000000..b339af79d6e
--- /dev/null
+++ b/examples/nxp/analyzing_with_inspector.py
@@ -0,0 +1,58 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Print profiling table for the NXP Neutron NPU model
+
+from typing import Any, Union
+
+from executorch.devtools import Inspector
+
+
+def parse_delegate_metadata(
+    delegate_metadatas: list[bytes],
+) -> Union[list[str], dict[str, Any]]:
+    """Metadata parser for Neutron Backend metadata.
+
+    The parser is a callable that deserializes the data and returns neutron kernel number.
+    The deserialized data is then added back to the corresponding event in the event block for user consumption.
+    """
+
+    metadata_list = []
+    for metadata_bytes in delegate_metadatas:
+        if len(metadata_bytes) == 1:
+            function_code = metadata_bytes[0]
+            if function_code == 0:
+                metadata_list.append("Profiling dump")
+            else:
+                metadata_list.append("Neutron kernel " + str(function_code))
+        else:
+            metadata_list.append("Invalid metadata size")
+    return metadata_list
+
+
+if __name__ == "__main__":
+
+    try:
+        etrecord_path = "etrecord/etrecord.bin"
+        etdump_path = "etdump/trace.etdump"
+        inspector = Inspector(
+            etdump_path=etdump_path,
+            etrecord=etrecord_path,
+            delegate_metadata_parser=parse_delegate_metadata,
+        )
+
+        # Access raw event data and filter quantized_decomposed nodes
+        for event_block in inspector.event_blocks:
+            for event in event_block.events:
+                if hasattr(event, "op_types") and isinstance(event.op_types, list):
+                    # Filter out quantized_decomposed ops from the actual list
+                    filtered = [
+                        op for op in event.op_types if "quantized_decomposed" not in op
+                    ]
+                    event.op_types = filtered if filtered else event.op_types
+
+        inspector.print_data_tabular(include_delegate_debug_data=True)
+    except Exception as e:
+        print(f"Error during inspection: {type(e).__name__}: {e}")
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py
index f5f92d36541..258b4c87772 100644
--- a/examples/nxp/aot_neutron_compile.py
+++ b/examples/nxp/aot_neutron_compile.py
@@ -8,6 +8,7 @@
 import argparse
 import io
 import logging
+import os
 from collections import defaultdict
 
 import executorch.extension.pybindings.portable_lib
@@ -167,6 +168,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         default=False,
         help="Use QAT mode for quantization (performs two QAT training epochs)",
     )
+    parser.add_argument(
+        "--use_profiling",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable profiling for eIQ Neutron NPU delegated model",
+    )
     parser.add_argument(
         "-s",
         "--so_library",
@@ -322,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         operators_not_to_delegate=args.operators_not_to_delegate,
         fetch_constants_to_sram=args.fetch_constants_to_sram,
         dump_kernel_selection_code=args.dump_kernel_selection_code,
+        use_profiling=args.use_profiling,
     )
     partitioners = (
         [
@@ -338,6 +347,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
     edge_program_manager = to_edge_transform_and_lower(
         export(module, example_inputs, strict=True),
         transform_passes=NeutronEdgePassManager(),
+        generate_etrecord=args.use_profiling,
         partitioner=partitioners,
         compile_config=EdgeCompileConfig(
             _core_aten_ops_exception_list=core_aten_ops_exception_list,
@@ -360,6 +370,21 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         exec_prog = edge_program_manager.to_executorch(
             config=ExecutorchBackendConfig(extract_delegate_segments=False)
         )
+
+        # Generate ETRecord if profiling flag is set
+        if args.use_profiling:
+            etrecord_path = os.path.join("etrecord", f"{args.model_name}_etrecord.bin")
+            # Create directory if it doesn't exist
+            os.makedirs(os.path.dirname(etrecord_path), exist_ok=True)
+            # Save ETRecord
+            exec_prog.get_etrecord().save(etrecord_path)
+            # Notify the user about profiling enablement and ETRecord generation.
+            logging.info(
+                "The model was converted with profiling enabled. The time spent generating the profiling dump is traced as the "
+                "final delegate operation and can be ignored, as no dump is produced for non‑profilable models."
+            )
+            logging.info(f"The ETRecord for the model was saved to {etrecord_path}.")
+
     except RuntimeError as e:
         if "Missing out variants" in str(e.args[0]):
             raise RuntimeError(
@@ -378,8 +403,10 @@ def executorch_program_to_str(ep, verbose=False):
     logging.debug(f"Executorch program:\n{executorch_program_to_str(exec_prog)}")
 
     # 6. Serialize to *.pte
-    model_name = f"{args.model_name}" + (
-        "_nxp_delegate" if args.delegate is True else ""
+    model_name = (
+        f"{args.model_name}"
+        + ("_nxp_delegate" if args.delegate is True else "")
+        + ("_profile" if args.use_profiling is True else "")
     )
     save_pte_program(exec_prog, model_name)