diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index 6ec8ee80688..d9b1d9cff43 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -73,13 +73,16 @@ class EdgeProgramToIRConverter: _default_target_spec = NeutronTargetSpec("imxrt700") _default_delegation_options = CustomDelegationOptions() + def __init__(self): + self.edge_to_tflite_map = {} + def convert_program( self, edge_program: ExportedProgram, conversion_config: ConversionConfig = _default_conversion_config, neutron_target_spec: NeutronTargetSpec = _default_target_spec, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, - ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]: + ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]: """ Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes. @@ -87,8 +90,11 @@ def convert_program( :param conversion_config: ConversionConfig instance. :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param custom_delegation_options: Custom user options which affect node delegation. - :return: TFLite flatbuffers as bytes. + :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping. """ + # Reset the edge to tflite map for each conversion + self.edge_to_tflite_map = {} + parameters_mapping = self.map_inputs_to_parameters(edge_program) dim_order_map = self.map_nodes_to_dim_order(edge_program) @@ -112,6 +118,9 @@ def convert_program( # Apply optimizations and finalize the model. internal_tflite_model = cc.tflite_builder.finish() + # Get the final edge to tflite mapping after optimization + self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map + # Extract the formats of the model's inputs and outputs. 
io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature) @@ -119,7 +128,7 @@ def convert_program( flatbuffers_builder = flatbuffers.Builder() internal_tflite_model.gen_tflite(flatbuffers_builder) - return bytes(flatbuffers_builder.Output()), io_formats + return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map @staticmethod def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext): @@ -161,7 +170,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, ] - for node in nodes: if node.op == "call_function": if node.target in qdq_related_functions and "cluster" in node.meta: @@ -173,7 +181,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex # The node was already processed alongside the Q/DQ ops. pass elif node.target in functions_converters: + # Get TFLite op count BEFORE conversion + tflite_op_count_before = len( + conversion_context.tflite_builder.get_operators().vector + ) + # Convert the node functions_converters[node.target](conversion_context).convert(node) + # Get TFLite op count AFTER conversion + tflite_op_count_after = len( + conversion_context.tflite_builder.get_operators().vector + ) + + # Track the mapping - store edge debug handle in operators. + # Get the edge debug handle so it can be associated with newly created operators. + edge_debug_handle = node.meta.get("debug_handle", None) + if ( + edge_debug_handle is not None + and tflite_op_count_after > tflite_op_count_before + ): + operators = ( + conversion_context.tflite_builder.get_operators().vector + ) + # Node converters append new operators to the TFLite builder. + # Only operators added during this conversion step (from "before" to "after") + # are tagged with the current edge_debug_handle. 
+ for i in range(tflite_op_count_before, tflite_op_count_after): + # Store edge debug handle in operator's temporary attribute + operators[i].tmp_edge_debug_handle = edge_debug_handle + logger.d( + f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'" + ) + else: logger.e( logger.Code.NOT_IMPLEMENTED, diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index f97a194ce87..41820c3ab61 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -85,6 +85,10 @@ class ModelBuilder: conversion_config: ConversionConfig + edge_to_tflite_map: dict[ + int, tuple[int, ...] + ] # Mapping edge debug handles to tuple of TFLite operator indices + _default_conversion_config = ConversionConfig() def __init__( @@ -105,6 +109,7 @@ def __init__( self._nchw_tensor_version = {} self._skipped_output_map = {} self._zeros_tensor_map = {} + self.edge_to_tflite_map = {} def create_zeros_tensor( self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False @@ -503,6 +508,9 @@ def finish(self) -> tflite_model.Model: self.conversion_config.optimization_blacklist, ) + # Create the final edge-to-tflite mapping after model optimization + self._create_edge_to_tflite_mapping() + self._keep_one_empty_buffer() # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference. @@ -524,6 +532,29 @@ def finish(self) -> tflite_model.Model: return self._tfl_model + def _create_edge_to_tflite_mapping(self): + """Create edge-to-TFLite mapping and save it to the edge_to_tflite_map class variable. + + This function should be called after all model optimizations have been applied to match the output TFLite model. 
+ """ + + edge_to_tflite_dict = {} + for idx, op in enumerate(self.get_operators().vector): + if ( + hasattr(op, "tmp_edge_debug_handle") + and op.tmp_edge_debug_handle is not None + ): + debug_handle = op.tmp_edge_debug_handle + if debug_handle not in edge_to_tflite_dict: + edge_to_tflite_dict[debug_handle] = [] + edge_to_tflite_dict[debug_handle].append(idx) + + # Convert lists to tuples in the dictionary + self.edge_to_tflite_map = {k: tuple(v) for k, v in edge_to_tflite_dict.items()} + logger.i( + f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}" + ) + def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool): for tensor in outputs.tmp_outputs: try: diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py index 6e8e7b6c33b..d8d0bada57d 100755 --- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py +++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py @@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject): # If `True`, this is an extra operator added during conversion. It was not present in the original input model. 
tmp_added_extra: bool + # Edge program debug handle for mapping edge nodes to TFLite operators + tmp_edge_debug_handle: Optional[int] + def __init__( self, inputs: OperatorInputs = None, @@ -541,6 +544,8 @@ def __init__( self.tmp_version = 1 self.tmp_added_extra = False + self.tmp_edge_debug_handle = None + def uses_per_channel_quantization(self) -> bool: """Determine if this operator uses per-channel quantization.""" for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs): diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 0abee0cdc86..92b4e25a5de 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts): cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[ "dumpKernelSelectionCode" ] + if ( + hasattr(cctx.compilationOpts, "useProfiling") + and compilation_opts["useProfiling"] + ): + cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"] + cctx.compilationOpts.dumpAfterImport = "console" + cctx.compilationOpts.dumpAfterGenerate = "console" + cctx.compilationOpts.verbose = compilation_opts["useProfiling"] + return cctx @@ -81,6 +90,7 @@ def convert( target: str, delegation_tag: str, fetch_constants_to_sram: bool = False, + use_profiling: bool = False, ) -> bytes: """ Call Neutron Converter. @@ -89,6 +99,7 @@ def convert( :param target: The target platform. :param delegation_tag: The delegation tag of model partition. :param fetch_constants_to_sram: Add microcode that fetches weights from external memory. + :param use_profiling: Use profiling for neutron delegated model. This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers). :return: TFLite model with Neutron microcode as bytes. 
@@ -102,6 +113,7 @@ def convert( "excludeGraphPasses": "HoistSliceAboveTranspose,MergeTranspose", "fetchConstantsToSRAM": fetch_constants_to_sram, "dumpKernelSelectionCode": self.dump_kernel_selection_code, + "useProfiling": use_profiling, } # Try to use multiprocessing for isolation, but fall back to direct execution diff --git a/backends/nxp/backend/neutron_map.py b/backends/nxp/backend/neutron_map.py new file mode 100644 index 00000000000..e2da653daa3 --- /dev/null +++ b/backends/nxp/backend/neutron_map.py @@ -0,0 +1,457 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import logging +import re +from dataclasses import dataclass + +# example: Type: CONV_2D +# Inputs: +# [0]: quantized_decomposed_quantize_per_tensor_default_4 +# [1]: quantized_decomposed_dequantize_per_channel_default_2 +# Outputs: +# [0]: quantized_decomposed_quantize_per_tensor_default_5 +# Location: 4 +PATTERN_NODE = ( + r"Type:\s+(?P\w+)\s+" + r"Inputs:(?P[\s\S]*?)" + r"Outputs:(?P[\s\S]*?)" + r"Location:\s+(?P\d+)" +) +# The pattern is very similar to operator pattern +PATTERN_SUBGRAPH = ( + r"^(?P\d+)\s*" + r"Inputs:(?P[\s\S]*?)" + r"Outputs:(?P[\s\S]*?)" + r"Tensors:" +) +# example: [0]: quantized_decomposed_quantize_per_tensor_default_4 +PATTERN_IO_TENSOR_NAME = r"\[\d+\]:\s+(?P[\S]+)" +# example: Statistics for NeutronGraph "subgraph_195": +PATTERN_GRAPH = r"Statistics for NeutronGraph \"subgraph_(?P\d+)\":" +# example: NeutronOperator "subgraph_001": +# Operators: +# PAD +# CONV_2D +# Kernels: +# Pad +# Conv2DStandardV2 +# NeutronOperator "subgraph_002": +PATTERN_VERBOSE_KERNELS = ( + r"\"subgraph_(?P\d+)\"\:\s*" + r"Operators:[\s\S]*?" 
+ r"Kernels:\s*(?P[\s\S]*?)" + r"\s*(NeutronOperator|^$|=)" +) +# example: NeutronGraph "subgraph_074": +PATTERN_VERBOSE_GRAPH = ( + r"NeutronGraph\s*\"subgraph_(?P\d+)\":(?P[\s\S]*?)\s*(^$|=)" +) +# Two graphs are expected in the input log: original and converted. +EXPECTED_GRAPHS = 2 +# List of single-input nodes that shouldn't be mapped on the same TFLite node. +SINGLE_INPUT_NODES = [ + "ABS", + "AVERAGE_POOL_2D", + "CAST", + "EXP", + "HARD_SWISH", + "LEAKY_RELU", + "LOG", + "LOGISTIC", + "MAX_POOL_2D", + "QUANTIZE", + "RSQRT", + "TANH", +] + + +@dataclass +class Node: + name: str # Name of the node. + inputs: list[str] # List of nodes inputs. + outputs: list[str] # List of nodes outputs. + location: int # Location in graph/subgraph. + + +@dataclass +class SubgraphInfo: + num: int # Subgraph number. + location: int # Location in neutron graph + inputs: list[str] # List of subgraphs inputs. + outputs: list[str] # List of subgraphs outputs. + kernels: int # Number of neutron kernels in neutron subgraph. + nodes: list[Node] # List of tflite nodes in neutron subgraph. + + +def get_tensors_name(tensors: str) -> list[str]: + """Split input string with tensor names into list of names""" + return [m.group("name") for m in re.finditer(PATTERN_IO_TENSOR_NAME, tensors)] + + +class NeutronMap: + """Mapping between Neutron, TFLite, and Edge operators based on the Neutron converter log. + + Parses the Neutron converter log to extract information about TFLite nodes and Neutron subgraphs. + Maps TFLite operators to corresponding Neutron operators. + Maps Edge operators to Neutron operators via the Edge-to-TFLite mapping. + + Attributes: + tflite_nodes (list[Node]): TFLite node information extracted from the converter log. + neutron_subgraphs (list[SubgraphInfo]): Neutron subgraph information extracted from the converter log. + neutron_graphs (list[int]): Indices of final Neutron graphs derived from neutron_subgraphs. 
+ edge_to_tflite_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to TFLite operators. + edge_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to Neutron operators. + tflite_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from TFLite operators to Neutron operators. + + Example: + >>> map = NeutronMap(log_output, edge_to_tflite_map) + >>> neutron_to_edge_map = map.get_neutron_to_edge_map() + """ + + tflite_nodes: list[Node] + neutron_subgraphs: list[SubgraphInfo] + neutron_graphs: list[int] + edge_to_tflite_map: dict[int, tuple[int, ...]] + edge_to_neutron_map: dict[int, tuple[int, ...]] + tflite_to_neutron_map: dict[int, tuple[int, ...]] + + def __init__( + self, neutron_converter_log: str, edge_to_tflite_map: dict[int, tuple[int, ...]] + ) -> None: + """Initialize neutron map from neutron converter log. + + :param neutron_converter_log: neutron converter log obtained during model conversion. It should contain + original tflite graph and neutron graph dump. To add these dumps to converter log the dumpAfterImport and + dumpAfterGenerate flags have to be set to "console". + """ + super().__init__() + self.tflite_nodes = [] + self.neutron_subgraphs = [] + self.neutron_graphs = [] + self.edge_to_tflite_map = edge_to_tflite_map + self.tflite_to_neutron_map = {} + self.edge_to_neutron_map = {} + self.neutron_kernels_num = 0 + self._split_profiling_log(neutron_converter_log) + + def _split_profiling_log(self, log: str) -> None: + """Process profiling log to split it into original TFLite and converted Neutron nodes. + + :param log: Neutron converter log obtained during model conversion, containing the original + TFLite graph and Neutron graph dump. + :return: None. Sets class attributes tflite_nodes and neutron_subgraphs with node information. 
+ """ + graphs = log.split("Graphs:") + # Check if there is two graphs in the input dump + if len(graphs) != EXPECTED_GRAPHS + 1: + return + optimization_dump, neutron_graph_dump = graphs[1:] + + # Get tflite model dump + tflite_graph_dump = optimization_dump.partition("= Optimize Graph =")[0] + + # Get verbose Neutron graphs located in the Extract Graphs section. + extracted_graph_dump = optimization_dump.partition("= Extract Graphs =")[ + 2 + ].partition("Generate code for NeutronGraph")[0] + + # Get list of original operators from first dumped graph. + self.tflite_nodes = [ + Node( + matched_operator.group("type"), + get_tensors_name(matched_operator.group("inputs")), + get_tensors_name(matched_operator.group("outputs")), + int(matched_operator.group("location")), + ) + for matched_operator in re.finditer(PATTERN_NODE, tflite_graph_dump) + ] + # Get list of neutron subgraphs. + self.neutron_subgraphs = self._get_neutron_subgraphs(neutron_graph_dump) + if self.neutron_subgraphs: + self._update_neutron_subgraphs_info(extracted_graph_dump) + + def _get_neutron_subgraphs(self, graph_dump: str) -> list[SubgraphInfo]: + """Parse Neutron graph dump and extract subgraph information. + + :param graph_dump: String containing the Neutron graph dump from the converter log. + :return: List of SubgraphInfo objects containing subgraph metadata and operator nodes. + """ + + def get_subgraph_nodes(subrgraph_dump: str) -> list[Node]: + """Parse subgraph dump and extract operator nodes. + + :param subgraph_dump: String containing a single Neutron subgraph definition. + :return: List of Node objects representing operators in the subgraph. 
+ """ + return [ + Node( + matched_operator.group("type"), + get_tensors_name(matched_operator.group("inputs")), + get_tensors_name(matched_operator.group("outputs")), + int(matched_operator.group("location")), + ) + for matched_operator in re.finditer(PATTERN_NODE, subrgraph_dump) + ] + + subgraphs = graph_dump.split(r"Name: subgraph_") + if len(subgraphs) < 3: + return [] + + # Get numbers of final neutron graphs in converted model. + self.neutron_graphs = [ + int(matched_graphs.group("num")) + for matched_graphs in re.finditer(PATTERN_GRAPH, subgraphs[-1]) + ] + if not self.neutron_graphs: + return [] + + # Get subgraphs + neutron_subgraphs: list[SubgraphInfo] = [] + for subgraph in subgraphs[1:]: + subgraph_match = re.search(PATTERN_SUBGRAPH, subgraph) + if not subgraph_match: + continue + neutron_subgraph = SubgraphInfo( + int(subgraph_match.group("num")), + -1, + get_tensors_name(subgraph_match.group("inputs")), + get_tensors_name(subgraph_match.group("outputs")), + 0, + get_subgraph_nodes(subgraph), + ) + neutron_subgraphs.append(neutron_subgraph) + return neutron_subgraphs + + def _update_neutron_subgraphs_info(self, extracted_graph: str) -> None: + """Update Neutron subgraphs with verbose info. + + - Set numbers of Neutron kernels in each Neutron subgraph. 99% of subgraphs contain only one Neutron kernel, + but there are some exceptions and some subgraphs can have more kernels. This number can be taken from + final Neutron graph info. + - Set Neutron subgraphs location in the final Neutron Graph. The function updates the location parameter + for each Neutron subgraph according to its position in the final Neutron graph. Location is calculated + continuously across all Neutron graphs in the model. Non-Neutron operators are skipped. + + :param extracted_graph: verbose Neutron graph dump. + """ + # Neutron graphs. 
+ neutron_graphs = extracted_graph.split("NeutronGraph") + location_shift = 0 + for neutron_graph in neutron_graphs: + + subgraph_nodes = { + int(matched_subgraph.group("subgraph")): { + "location": i + location_shift, + "kernels": [ + kernel.replace(" ", "") + for kernel in matched_subgraph.group("kernels").split("\n") + if kernel.strip() + ], + } + for i, matched_subgraph in enumerate( + re.finditer(PATTERN_VERBOSE_KERNELS, neutron_graph) + ) + } + if not subgraph_nodes: + continue + # Update location offset according to the number of kernels in the subgraph. + location_shift += len(subgraph_nodes) + + # Neutron graphs. + graph_num = -1 + matched_graph = re.search(r"subgraph_(?P\d+)", neutron_graph) + if matched_graph: + graph_num = int(matched_graph.group("subgraph")) + + # Update number of kernels for all subgraphs. + for subgraph in self.neutron_subgraphs: + if subgraph.num in subgraph_nodes: + subgraph.kernels = len(subgraph_nodes[subgraph.num]["kernels"]) + subgraph.location = subgraph_nodes[subgraph.num]["location"] + elif subgraph.num == graph_num: + subgraph.kernels = sum( + len(s["kernels"]) for s in subgraph_nodes.values() + ) + self.neutron_kernels_num += subgraph.kernels + + def _nodes_match_by_io(self, tf_node: Node, neutron_node: Node) -> bool: + """ + Determine whether a TFLite node can be mapped to a Neutron node + based on their input and output compatibility. + + :param tf_node: Source TFLite node. + :param neutron_node: Target Neutron node. + :return: True if the nodes can be considered mapped, False otherwise. + """ + + def get_name_matches(tf_names: list[str], neutron_names: list[str]) -> int: + # Count how many names from tf_names have a corresponding match in + # neutron_names. A match is defined as: + # - exact equality, or + # - one name being a hierarchical variant of the other + # (i.e., sharing a common prefix separated by "/"). 
+ result = 0 + for tf_name in tf_names: + # Determine if the tensor name corresponds to a special operation input. + # Matches names like "perm0", "perm1", etc. used by Transpose ops, + # and names like "padding0", "padding1", etc. used by Pad ops. + special_op = ( + "permutation" + if re.fullmatch(r"perm(\d+)?", tf_name) + else ( + "padding" + if re.fullmatch(r"padding(s)?(\d+)?", tf_name) + else None + ) + ) + for neutron_name in neutron_names: + if ( + neutron_name == tf_name + or neutron_name + "/" in tf_name + or tf_name + "/" in neutron_name + ): + result += 1 + break + + # Check if the neutron input is also the special op (Pad or Transpose) + if special_op and special_op in neutron_name: + result += 1 + break + return result + + name_matches = get_name_matches(tf_node.inputs, neutron_node.inputs) + # Map the node if all TFLite inputs match Neutron inputs. + # Note: the Neutron node may still have additional extra inputs. + if name_matches == len(tf_node.inputs): + return True + elif name_matches == len(tf_node.inputs) - 1: + # If there is only one unmatched input, check matching of outputs. + name_matches = get_name_matches(tf_node.outputs, neutron_node.outputs) + if name_matches == len(tf_node.outputs): + # Map the node if all TFLite outputs match Neutron outputs. + return True + return False + + def get_tflite_to_neutron_map(self) -> dict[int, tuple[int, ...]]: + """Map TFLite nodes from the original model to Neutron nodes in the converted model. + + The mapping is built based on input and output tensor names. Neutron tensors may have + exactly the same names or use the format "tflite_input/additional_name". + + :return: Dictionary mapping TFLite node indices to tuple of Neutron subgraph indices. 
+ """ + tflite_to_neutron_dict = {} + for tf_idx, tf_node in enumerate(self.tflite_nodes): + subgraph_idxs = [] + for subgraph in self.neutron_subgraphs: + if ( + subgraph.num in self.neutron_graphs + or subgraph.location in subgraph_idxs + ): + continue + for neutron_node in subgraph.nodes: + if self._nodes_match_by_io(tf_node, neutron_node): + subgraph_idxs.append(subgraph.location) + break + # Filter subgraph_idxs to avoid mapping multiple parallel single-input nodes that consume the + # same input tensor into the same TFLite node. + subgraph_idxs = self._filter_single_input_nodes(tf_node.name, subgraph_idxs) + if subgraph_idxs: + tflite_to_neutron_dict[tf_idx] = tuple(subgraph_idxs) + + self.tflite_to_neutron_map = tflite_to_neutron_dict + return self.tflite_to_neutron_map + + def _filter_single_input_nodes( + self, node_name: str, subgraph_loc: list[int] + ) -> list[int]: + """ + Filter the Neutron-to-TFLite mapping to avoid mapping multiple parallel single-input nodes + that consume the same input tensor to a single TFLite node. + + The function checks whether the current TFLite node is a supported single-input node + (as defined in SINGLE_INPUT_NODES) and whether it is mapped to multiple Neutron nodes. + In such cases, it is possible that parallel single-input Neutron nodes were incorrectly + mapped to the same TFLite node. + + If more than one single-input Neutron node is mapped, only one is kept in the mapping: + the Neutron node whose operation name matches the operation name of the current TFLite node. + + :param node_name: Operation name of the current TFLite node. + :param subgraph_loc: List of Neutron subgraph indices whose inputs correspond to the + input of the current TFLite node. + :return: Filtered list of Neutron subgraph indices to be mapped to the current TFLite node. + """ + # Check if there can be potential issue in mapping. 
+ if node_name in SINGLE_INPUT_NODES and len(subgraph_loc) > 1: + single_in_nodes = [] + # Find all single-input nodes in subgraph_idxs. + subgraphs = ( + subgraph + for subgraph in self.neutron_subgraphs + if subgraph.location in subgraph_loc + ) + for subgraph in subgraphs: + for neutron_node in subgraph.nodes: + if neutron_node.name in SINGLE_INPUT_NODES: + single_in_nodes.append((subgraph.location, neutron_node.name)) + if len(single_in_nodes) > 0: + # Keep only the node with the matching name when multiple single-input nodes are present in subgraph_idxs. + for subgraph_id, single_in_node_name in single_in_nodes: + if single_in_node_name == node_name: + return [subgraph_id] + return [] + return subgraph_loc + + def get_edge_to_neutron_map(self) -> dict[int, tuple[int, ...]]: + """Map Edge nodes to Neutron nodes. + + :return: Dictionary mapping Edge node handles to tuple of Neutron subgraph indices. + """ + self.get_tflite_to_neutron_map() + edge_to_neutron_dict = {} + + for edge_handle, tflite_indices in self.edge_to_tflite_map.items(): + neutron_nodes = set() + for tf_node in tflite_indices: + if tf_node in self.tflite_to_neutron_map: + neutron_nodes.update(self.tflite_to_neutron_map[tf_node]) + if neutron_nodes: + edge_to_neutron_dict[edge_handle] = tuple(neutron_nodes) + + self.edge_to_neutron_map = edge_to_neutron_dict + return self.edge_to_neutron_map + + def get_neutron_to_edge_map(self) -> dict[int, tuple[int, ...]]: + """ + Transform edge-to-neutron map to neutron-to-edge map. + + :return: Dictionary mapping neutron_index to tuple of edge_handles + """ + if not self.edge_to_neutron_map: + _ = self.get_edge_to_neutron_map() + + neutron_to_edge = {} + + for edge_handle, neutron_indices in self.edge_to_neutron_map.items(): + for neutron_idx in neutron_indices: + if neutron_idx not in neutron_to_edge: + neutron_to_edge[neutron_idx] = [] + neutron_to_edge[neutron_idx].append(edge_handle) + + # Fill gaps with empty tuples and convert lists to tuples. 
+ if neutron_to_edge: + max_neutron_idx = self.neutron_kernels_num + result = {} + # Add one more non-mapped event at the end of list for the Neutron Dump event. + for i in range(max_neutron_idx + 1): + if i in neutron_to_edge: + result[i] = tuple(neutron_to_edge[i]) + else: + result[i] = () + logging.info(f"Neutron to Edge map was created: {result}") + return result + else: + return {} diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index 1a84a418e92..ee711c34369 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -11,6 +11,8 @@ import logging import os import struct +import tempfile +from contextlib import contextmanager from typing import final import numpy as np @@ -26,6 +28,8 @@ from executorch.backends.nxp.backend.neutron_converter_manager import ( NeutronConverterManager, ) + +from executorch.backends.nxp.backend.neutron_map import NeutronMap from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.neutron_node_extraction import ( extract_artifacts_from_neutron_node, @@ -54,6 +58,7 @@ def __init__(self): self.use_neutron_for_format_conversion = True self.fetch_constants_to_sram = False self.dump_kernel_selection_code = False + self.use_profiling = False def _replace_colons(self, operator: str) -> str: """ @@ -70,6 +75,7 @@ def neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False, ) -> "NeutronCompileSpecBuilder": """Generate compile spec for Neutron NPU @@ -83,6 +89,7 @@ def neutron_compile_spec( :param fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights from FLASH to SRAM. This should be used when the whole model does not fit into SRAM. :param dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code. 
+ :param use_profiling: If true Neutron Converter will enable profiling for neutron delegated model :return: self for method chaining """ @@ -106,6 +113,7 @@ def neutron_compile_spec( self.use_neutron_for_format_conversion = use_neutron_for_format_conversion self.fetch_constants_to_sram = fetch_constants_to_sram self.dump_kernel_selection_code = dump_kernel_selection_code + self.use_profiling = use_profiling return self @@ -135,6 +143,10 @@ def build(self): "dump_kernel_selection_code", f"{self.dump_kernel_selection_code}".encode(), ), + CompileSpec( + "use_profiling", + f"{self.use_profiling}".encode(), + ), ] return self.compile_spec @@ -149,6 +161,7 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False, ) -> list[CompileSpec]: return ( NeutronCompileSpecBuilder() @@ -160,11 +173,36 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_profiling=use_profiling, ) .build() ) +@contextmanager +def capture_fd_output(): + tmp = tempfile.TemporaryFile() + + # Save original stdout / stderr + original_stdout_fd = os.dup(1) + original_stderr_fd = os.dup(2) + + try: + # Redirect fd=1 and fd=2 to temp file + os.dup2(tmp.fileno(), 1) + os.dup2(tmp.fileno(), 2) + + yield tmp # give access to the temp file + + finally: + # Restore original fds + os.dup2(original_stdout_fd, 1) + os.dup2(original_stderr_fd, 2) + + os.close(original_stdout_fd) + os.close(original_stderr_fd) + + @final class NeutronBackend(BackendDetails): @@ -185,6 +223,7 @@ def preprocess( # noqa C901 use_neutron_for_format_conversion = None fetch_constants_to_sram = False dump_kernel_selection_code = None + use_profiling = False for spec in compile_spec: if spec.key == "output_format": output_format = 
spec.value.decode() @@ -200,6 +239,8 @@ def preprocess( # noqa C901 fetch_constants_to_sram = spec.value.decode() == "True" if spec.key == "dump_kernel_selection_code": dump_kernel_selection_code = spec.value.decode() == "True" + if spec.key == "use_profiling": + use_profiling = spec.value.decode() == "True" # Check that the output format is set in the compile spec if not output_format: @@ -229,19 +270,32 @@ def preprocess( # noqa C901 if use_neutron_for_format_conversion is not None else {} ) - tflite_model, io_formats = EdgeProgramToIRConverter().convert_program( + ( + tflite_model, + io_formats, + edge_to_tflite_map, + ) = EdgeProgramToIRConverter().convert_program( edge_program, neutron_target_spec=NeutronTargetSpec(target), conversion_config=conversion_config, custom_delegation_options=CustomDelegationOptions(), ) - neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert( - tflite_model, - target, - delegation_tag, - fetch_constants_to_sram, - ) + with capture_fd_output() as tmp: + neutron_model = NeutronConverterManager( + dump_kernel_selection_code + ).convert( + tflite_model, + target, + delegation_tag, + fetch_constants_to_sram, + use_profiling, + ) + tmp.seek(0) + log_output = tmp.read().decode() + # Get mapping from tflite to neutron + map = NeutronMap(log_output, edge_to_tflite_map) + neutron_to_edge_map = map.get_neutron_to_edge_map() # Dump the tflite file if intermediates_dir is set if intermediates_dir != "None": @@ -265,7 +319,9 @@ def preprocess( # noqa C901 else: raise RuntimeError(f"Unknown format {output_format}") - return PreprocessResult(processed_bytes=binary) + return PreprocessResult( + processed_bytes=binary, debug_handle_map=neutron_to_edge_map + ) class PayloadComposer: diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp index 3ea973b7c5b..6fe0482ed89 100644 --- a/backends/nxp/runtime/NeutronBackend.cpp +++ b/backends/nxp/runtime/NeutronBackend.cpp @@ -10,6 +10,7 @@ #include 
#include #include +#include #include #include "NeutronDriver.h" @@ -25,6 +26,8 @@ namespace neutron { #define ALIGN_SIZE(size) \ ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1))) +#define KOPC_CALLARGS 6 // The operation for TileIR + // clang-format off /* Header schema: +----------------------------+-----------------------------+------------------------+ @@ -84,6 +87,19 @@ typedef struct { const uint8_t* outputMap; } NeutronExecutorchConfig; +typedef struct { + uint8_t eventCode; + uint8_t opCode; + uint8_t functionCode; + uint8_t timestampCode; + uint32_t time; +} NeutronSingleProfilingEvent; + +typedef struct { + NeutronSingleProfilingEvent startEvent; + NeutronSingleProfilingEvent stopEvent; +} NeutronFullProfilingEvent; + #ifdef EXTERNAL_MEM // Neutron compute has no access to FLASH. // Prefetch weights from FLASH to SRAM using memcpy. @@ -508,12 +524,11 @@ class NeutronBackend final : public PyTorchBackendInterface { } } -#ifdef NEUTRON_PROFILE - // TODO: Use trace from BackendExecutionContext. - NeutronTraceConfig trace_config{.traceConfig = 0}; - neutronSetTrace(cfg->nmh, &trace_config); +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks before neutron compute to measure how much time profiling dump + // takes + et_timestamp_t start_ticks = ::executorch::runtime::pal_current_ticks(); #endif - // Run neutron compute. NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg); if (neutronRC != ENONE) { @@ -523,6 +538,11 @@ class NeutronBackend final : public PyTorchBackendInterface { neutronRC); return Error::InvalidProgram; } +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks after neutron compute to measure how much time profiling dump + // takes + et_timestamp_t stop_ticks = ::executorch::runtime::pal_current_ticks(); +#endif // Transpose outputs. 
for (int i = 0; i < cfg->numOutputs; i++) { @@ -558,6 +578,53 @@ class NeutronBackend final : public PyTorchBackendInterface { } } } +#ifdef ET_EVENT_TRACER_ENABLED + // Add traced events only if model has profiling info. + auto profile_size = cfg->profileSize; + if (profile_size > 0) { + int events_num = static_cast(profile_size / 16); + auto profiling_index = cfg->numOutputs + 1; + char* profile_info = + static_cast(cfg->dcfg.outputs[profiling_index]); + NeutronFullProfilingEvent* neutron_events = + (NeutronFullProfilingEvent*)profile_info; + executorch::runtime::EventTracer* tracer = context.event_tracer(); + uint32_t start_time = 0; + int index = 0; + // Post log neutron events from profiling output. + for (int i = 0; i < events_num; i++) { + if (start_time == 0) { + start_time = neutron_events[i].startEvent.time; + } + if (neutron_events[i].stopEvent.opCode != KOPC_CALLARGS) { + // Only KOPC_CALLARGS events can be mapped to original .pte model. + continue; + } else { + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + start_time, + neutron_events[i].stopEvent.time, + static_cast( + &neutron_events[i].startEvent.functionCode), + sizeof(uint8_t)); + start_time = 0; + index++; + } + } + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + neutron_events[events_num - 1].startEvent.time, + neutron_events[events_num - 1].stopEvent.time + stop_ticks - + start_ticks, + static_cast( + &neutron_events[events_num - 1].startEvent.functionCode), + sizeof(uint8_t)); + } +#endif return Error::Ok; } diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index 44a96010593..1309e019428 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -190,6 +190,7 @@ def to_quantized_edge_program( use_quant_state_dict: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False,
delegate_to_npu=True, ) -> EdgeProgramManager: _neutron_target_spec = NeutronTargetSpec(target) @@ -223,6 +224,7 @@ def to_quantized_edge_program( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_profiling=use_profiling, ) post_quant_state_dict = ( exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None @@ -244,6 +246,7 @@ def to_quantized_edge_program( export(exir_program_aten__module_quant, example_input, strict=True), transform_passes=NeutronEdgePassManager(), partitioner=partitioners, + generate_etrecord=use_profiling, compile_config=EdgeCompileConfig( _check_ir_validity=False, _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -274,6 +277,7 @@ def to_quantized_executorch_program( use_neutron_for_format_conversion: bool = True, dataset_dir: str | None = None, delegate_to_npu=True, + use_profiling: bool = False, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> ExecutorchProgramManager: @@ -295,6 +299,7 @@ def to_quantized_executorch_program( train_fn=train_fn, use_neutron_for_format_conversion=use_neutron_for_format_conversion, delegate_to_npu=delegate_to_npu, + use_profiling=use_profiling, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, **get_calibration_inputs_fn, diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py index 319f372b5fa..94e91a31b95 100644 --- a/backends/nxp/tests/executors.py +++ b/backends/nxp/tests/executors.py @@ -325,7 +325,7 @@ def convert_run_compare( if tfl_model is None: NodeFormatInference(edge_program).identify_node_formats() - tfl_model, _ = EdgeProgramToIRConverter().convert_program( + tfl_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program, conversion_config ) diff --git a/backends/nxp/tests/generic_tests/test_aot_example.py 
b/backends/nxp/tests/generic_tests/test_aot_example.py index 893041fe372..8a1e5e49555 100644 --- a/backends/nxp/tests/generic_tests/test_aot_example.py +++ b/backends/nxp/tests/generic_tests/test_aot_example.py @@ -2,11 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - +import os import subprocess import sys from pathlib import Path +from executorch.backends.nxp.tests.config_importer import test_config + # noinspection PyProtectedMember from executorch.exir._serialize import _deserialize_pte_binary from executorch.exir.schema import DelegateCall, KernelCall @@ -15,9 +17,8 @@ def test_aot_example__mobilenet_v2(): """Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated.""" - # Find the executorch root directory (5 levels up from this test file) - executorch_root = Path(__file__).parent.parent.parent.parent.parent - assert executorch_root.exists(), f"Executorch root not found at {executorch_root}" + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR # Run the compilation script as a module (like run_aot_example.sh does) cmd = [ @@ -34,14 +35,14 @@ def test_aot_example__mobilenet_v2(): ] # Output file will be created in executorch_root - pte_file = executorch_root / "mobilenetv2_nxp_delegate.pte" + pte_file = Path(os.path.join(executorch_root, "mobilenetv2_nxp_delegate.pte")) try: result = subprocess.run( cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout just in case. On my machine, the test usually runs ~1 minute. + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. 
cwd=str( executorch_root ), # Run from executorch root (like run_aot_example.sh) @@ -95,3 +96,77 @@ def test_aot_example__mobilenet_v2(): # Clean up the generated file if pte_file.exists(): pte_file.unlink() + + +def test_aot_example__mobilenet_v2__profiling(): + """Test that mobilenet_v2 can be lowered to Neutron backend via `aot_neutron_compile.py`, all ops are delegated, + the output model is profilable and ETRecord is generated properly.""" + + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR + + # Run the compilation script as a module (like run_aot_example.sh does) + cmd = [ + sys.executable, + "-m", + "examples.nxp.aot_neutron_compile", + "--model_name", + "mobilenetv2", + "--delegate", + "--quantize", + "--target", + "imxrt700", + "--remove-quant-io-ops", + "--use_channels_last_dim_order", + "--use_profiling", # Generate profilable model and create ETRecord + "--use_random_dataset", # Avoid downloading the dataset. + ] + + # Output files will be created in executorch_root. + pte_file = Path( + os.path.join(executorch_root, "mobilenetv2_nxp_delegate_profile.pte") + ) + etrecord_file = Path( + os.path.join(executorch_root, "etrecord", "mobilenetv2_etrecord.bin") + ) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. + cwd=str( + executorch_root + ), # Run from executorch root (like run_aot_example.sh) + ) + + # Check script ran successfully. + assert result.returncode == 0, ( + f"Script failed with return code {result.returncode}\n" + f"STDOUT:\n{result.stdout}\n" + f"STDERR:\n{result.stderr}" + ) + + # Check if delegated model was created and saved. + assert pte_file.exists(), f"PTE file not created at {pte_file}" + + # Combine stdout and stderr to capture all subprocess output, including logs. + process_output = result.stdout + result.stderr + + # Check if nonempty Neutron to Edge map was created. 
+ assert "Neutron to Edge map was created:" in process_output + + # Check if ETRecord was created and saved. + assert "The ETRecord for the model was saved to" in process_output + assert etrecord_file.exists(), f"ETRecord file not created at {etrecord_file}" + + finally: + # Clean up the generated files. + if pte_file.exists(): + pte_file.unlink() + if etrecord_file.exists(): + etrecord_file.unlink() + parent = etrecord_file.parent + if not any(parent.iterdir()): + parent.rmdir() diff --git a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py index 27bd675a487..6aa07dbba8d 100644 --- a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py +++ b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py @@ -629,7 +629,7 @@ def test_move_activation_before_concat_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -668,7 +668,7 @@ def test_move_activation_before_concat_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -706,7 +706,7 @@ def test_move_activation_before_concat_quantization__addmm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + 
tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -744,7 +744,7 @@ def test_move_activation_before_concat_quantization__mm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -788,9 +788,7 @@ def test_concat_cluster_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 @@ -861,9 +859,7 @@ def test_concat_cluster_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py index 8cf7dfe3dc2..52654a482b9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py +++ b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py @@ -37,7 +37,7 @@ def test_lowered_program_and_tflite_output_match__conv2d__no_bias(mocker): ) # Capture generated model - tflite_flatbuffers_model, io_formats = 
converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return tflite_model = Model.GetRootAs(tflite_flatbuffers_model) sub_graph = tflite_model.Subgraphs(0) @@ -84,7 +84,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker): exported_program: ExportedProgram = converter_spy.call_args.args[1] # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # No Transpose ops in produced TFLite model tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) @@ -148,7 +148,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker): ) # Capture the converted IR model. - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Make sure the `Transpose` ops ARE in the IR model. tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) diff --git a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py index 0705203db06..359dfdb67e9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py @@ -28,7 +28,7 @@ def test_conv2d_neutron_conversion(): NodeFormatInference(edge_program_manager.exported_program()).identify_node_formats() edge_program_converter = EdgeProgramToIRConverter() - tflite_model, _ = edge_program_converter.convert_program( + tflite_model, *_ = edge_program_converter.convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py index 706d8ed3e14..af9ef08057b 100644 --- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py +++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py @@ -153,7 +153,7 @@ def 
test_per_channel_convolution(self, _, use_qat: bool): use_neutron_for_format_conversion=False, ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( diff --git a/backends/nxp/tests/generic_tests/test_profiling.py b/backends/nxp/tests/generic_tests/test_profiling.py new file mode 100644 index 00000000000..c922eb070c3 --- /dev/null +++ b/backends/nxp/tests/generic_tests/test_profiling.py @@ -0,0 +1,158 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import ast +import logging +import re + +import numpy as np +import pytest +import torch +from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + NumericalStatsOutputComparator, +) + +from executorch.backends.nxp.tests.models import AvgPool2dModule, SoftmaxModule +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare + +from executorch.examples.nxp.experimental.cifar_net.cifar_net import CifarNetModel + + +@pytest.fixture(autouse=True) +def reseed_model_per_test_run(): + torch.manual_seed(23) + np.random.seed(23) + + +PATTERN_NEUTRON_MAP = r"Neutron to Edge map was created: (\{.*\})" + + +def extract_map_from_logs(caplog): + for record in caplog.records: + msg = record.getMessage() + neutron_map_match = re.search(PATTERN_NEUTRON_MAP, msg) + if neutron_map_match: + dict_str = neutron_map_match.group(1) + return ast.literal_eval(dict_str) + return None + + +class ParallelPoolModel(torch.nn.Module): + def __init__(self, channels: int): + super().__init__() + self.conv_in = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.max_pool2d = 
torch.nn.MaxPool2d(kernel_size=2, stride=2) + self.avg_pool2d = torch.nn.AvgPool2d(kernel_size=2, stride=2) + self.conv_out = torch.nn.Conv2d(2 * channels, channels, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = torch.cat((self.max_pool2d(x), self.avg_pool2d(x)), dim=1) + x = self.conv_out(x) + return x + + +class TestProfiling: + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__softmax(self, caplog, request): + caplog.set_level(logging.INFO) + model = SoftmaxModule(-1) + lower_run_compare( + model, + (10,), + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + use_profiling=True, + output_comparator=NumericalStatsOutputComparator(), + ) + + # Neutron map for 1D Softmax with input size 10 should contain 4 nodes: + # 3 Neutron kernels (pad, softmax, and slice) and 1 unmapped node used for profiling dump + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Softmax + 2: (2,), # Slice + 3: (), # Neutron Dump + } + + def test__parallel_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model = ParallelPoolModel(input_shape[1]) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (6,), # Conv2DStandardV2 + 1: (), # Conv2DDepthwiseV2 (AvgPool) + 2: (7,), # MaxPool + 3: (), # TransposeCHW + 4: (), # TransposeCHW + 5: (), # TransposeCHW + 6: (), # Slice + 7: (), # Pad + 8: (), # Conv2DPointwise + 9: (), # Slice + 10: (), # Neutron Dump + } + + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__cifar(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model =
CifarNetModel() + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (10,), # Pad + 1: (10, 11), # Conv2DStandardV1 (Pad + Conv2d) + 2: (12,), # MaxPool + 3: (13, 14), # Conv2DStandardV1 (Pad + Conv2d) + 4: (15,), # MaxPool + 5: (16, 17), # Conv2DStandardV1 (Pad + Conv2d) + 6: (18,), # MaxPool + 7: (20,), # FullyConnected + 8: (21,), # Pad + 9: (21,), # Softmax + 10: (21,), # Slice + 11: (), # Neutron Dump + } + + def test__avg_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (2, 9, 6, 15) + model = AvgPool2dModule(False, 0) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Conv2DDepthwiseDense + 2: (2,), # Slice + 3: (), # Neutron Dump + } diff --git a/backends/nxp/tests/generic_tests/test_quantizer.py b/backends/nxp/tests/generic_tests/test_quantizer.py index 3c23241e01e..6180d2fd9ae 100644 --- a/backends/nxp/tests/generic_tests/test_quantizer.py +++ b/backends/nxp/tests/generic_tests/test_quantizer.py @@ -432,7 +432,7 @@ def test_quantizer__linear_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -477,7 +477,7 @@ def test_quantizer__addmm_w_activation(mocker, 
activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -522,7 +522,7 @@ def test_quantizer__mm_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -567,7 +567,7 @@ def test_quantizer__conv_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py index a8cdee41830..668deb28c96 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py @@ -51,7 +51,7 @@ def test_addmm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) 
* 50).astype( np.int8 @@ -84,7 +84,7 @@ def test_linear_conversion__with_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py index dc442a4931c..466f596bf91 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py @@ -59,7 +59,7 @@ def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat) # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data_1 = ( np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py index b4b828cd4e6..5ee3db6752f 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py @@ -182,7 +182,7 @@ def test_conv_dropout_quant( use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -241,7 +241,7 @@ def test_clone_pool_view_copy_quant( 
use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -311,7 +311,7 @@ def test_clone__to_contiguous_format(self): ).identify_node_formats() # Convert to the IR. - converted_model, _ = EdgeProgramToIRConverter().convert_program( + converted_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py index 828647d2113..7105514514a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py @@ -177,7 +177,7 @@ def test_conv2d_quant_conversion(mocker, model: torch.nn.Module, input_shape, us ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] @@ -367,7 +367,7 @@ def test_conv_transpose2d_conversion__quantized( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py index 60dbfd1b215..79fffff3b78 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py +++ 
b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py @@ -51,7 +51,7 @@ def test_mm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -85,7 +85,7 @@ def test_linear_conversion__without_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py index e0fc0d85066..2e7f9035e8a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py @@ -74,7 +74,7 @@ def test_convert_neg(mocker, input_shape): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 @@ -105,7 +105,7 @@ def test_convert_neg__channels_last(mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py index fb25f02785a..c5c7aa55b03 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py @@ -50,7 +50,7 @@ def test_prelu_with_linear_quant_conversion(mocker, input_shape): ).exported_program() # Capture generated entities - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] # Check `prelu` was not decomposed into simpler edge operators diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py index 2621baf18ee..00c10bd257d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py @@ -85,7 +85,7 @@ def test_softmax_delegation(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`, and its input has the expected rank. @@ -121,7 +121,7 @@ def test_softmax_delegation__channel_first(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py index cb5f398fa21..276b29da142 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py @@ -265,7 +265,7 @@ def test_view_copy_w_linear_quant_conversion(mocker, input_shape, new_shape, use ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] @@ -299,7 +299,7 @@ def test_view_w_conv_linear_quant_conversion( ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py index 88ea567381f..aadef8c7731 100644 --- a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py +++ b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py @@ -251,7 +251,7 @@ def test_linear_bn_full_qat_pipeline_conversion( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git 
a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py index d5ff3680f38..ef6fe9c864c 100644 --- a/backends/nxp/tests/nsys_testing.py +++ b/backends/nxp/tests/nsys_testing.py @@ -101,6 +101,8 @@ def _run_delegated_executorch_program( mocker, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> tuple[ExportedProgram, str]: @@ -129,6 +131,8 @@ def wrapper(*args, **kwargs): delegate_to_npu=True, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) @@ -405,6 +409,8 @@ def lower_run_compare( reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ): @@ -424,6 +430,10 @@ def lower_run_compare( :param reference_model: Version of the model which will be run to obtain reference output data. :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. + :param use_profiling: Enable profiling for neutron delegated model. + :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to + ensure that the IO matches the executorch partition, which will be + delegated to Neutron, :param operators_not_to_delegate: list of operators not to delegate. 
:param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized version of dataset (quantized INT8 input samples). @@ -468,6 +478,8 @@ def lower_run_compare( mocker, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) diff --git a/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png new file mode 100644 index 00000000000..50ed49f57ec Binary files /dev/null and b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png differ diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md index 22499aea7ad..b8739046351 100644 --- a/docs/source/backends/nxp/nxp-overview.md +++ b/docs/source/backends/nxp/nxp-overview.md @@ -64,6 +64,8 @@ here https://www.nxp.com/design/design-center/software/eiq-ai-development-enviro **→{doc}`nxp-kernel-selection` — Neutron Firmware Kernel Selection support.** +**→{doc}`nxp-profiling` — Neutron models profiling.** + ```{toctree} :maxdepth: 2 :hidden: @@ -74,4 +76,5 @@ nxp-quantization tutorials/nxp-tutorials nxp-dim-order nxp-kernel-selection +nxp-profiling ``` diff --git a/docs/source/backends/nxp/nxp-profiling.md b/docs/source/backends/nxp/nxp-profiling.md new file mode 100644 index 00000000000..17e352e479d --- /dev/null +++ b/docs/source/backends/nxp/nxp-profiling.md @@ -0,0 +1,205 @@ +# NXP eIQ Profiling Support + + +The eIQ Neutron Backend is integrated with the +[Developer Tools](https://docs.pytorch.org/executorch/stable/delegate-debugging.html) +to provide visibility into delegated operator execution time. + +There are three steps required to obtain profiling results for an NXP‑delegated model: + +* Convert the model with profiling support enabled. +* Generate the artifacts consumed by the Developer Tools (`ETRecord`, `ETDump`). 
+* Create and run the Inspector class to consume these artifacts and print the results. + +--- + +## Convert a model with profiling support + +Profiling data is generated only for a **profilable** model. +To convert a model with profiling enabled, the `--use_profiling` flag must be set. + +See the `aot_neutron_compile.py` example and its +[README](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md) +for additional details. + +The following command creates a profilable `cifar10_nxp_delegate_profile.pte` model and the corresponding `ETRecord` for the +**i.MX RT700** board: + +```bash +python -m examples.nxp.aot_neutron_compile --quantize \ + --delegate -m cifar10 \ + --use_profiling +``` + +For installation details, see {doc}`nxp-overview`. + +--- + +## Generate ETRecord (Optional) + +`ETRecord` is an optional artifact that contains model graphs and metadata used to link runtime profiling results +back to the eager model. + +The recommended approach is to enable `ETRecord` generation by passing `generate_etrecord=True` to export API calls. +After export completes, retrieve the `ETRecord` using the `get_etrecord()` method, and save it using the `save()` method: + +### Example + +```python +from executorch.devtools.etrecord import generate_etrecord + +# 1. Open a model and export the model to ATEN +model = model.eval() +exported_program = torch.export.export(model, example_inputs, strict=True) +module = exported_program.module() + +# 2. 
Transform and lower +compile_spec = generate_neutron_compile_spec("imxrt700") +partitioners = ( + [ + NeutronPartitioner( + compile_spec, + NeutronTargetSpec(target="imxrt700"), + post_quantization_state_dict=module.state_dict(), + ) + ] +) +edge_program_manager = to_edge_transform_and_lower( + export(module, example_inputs, strict=True), + transform_passes=NeutronEdgePassManager(), + generate_etrecord=True, + partitioner=partitioners, + compile_config=EdgeCompileConfig( + _core_aten_ops_exception_list=core_aten_ops_exception_list, + ), +) + +# 3. Export to ExecuTorch program +exec_prog = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) +) +# Save ETRecord +exec_prog.get_etrecord().save("etrecord.bin") + +``` + +### Complete Example + +A full implementation is available +in [aot_neutron_compile.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/aot_neutron_compile.py). + +The `--use_profiling` flag is used to create a **profilable** model and the corresponding `ETRecord` file +(see [Convert a model with profiling support](#convert-a-model-with-profiling-support) for the full command). + + +--- + +## Generate ETDump + + +The next step is to generate an `ETDump`. An `ETDump` contains runtime data collected during model inference execution. + +To generate an `ETDump`, ensure that the ExecuTorch runtime library is integrated with the Developer Tools and built +with the `ET_EVENT_TRACER_ENABLED` flag enabled. + +Only models converted with profiling support will produce an `ETDump` containing execution times for all Neutron +operators. Otherwise, the dump will include only the final delegate execution time. + +Neutron software provides a profiling mechanism that logs individual operator execution times to a dedicated runtime +output. This data is then used to generate post‑time events after the inference has completed. + + +### Example + +```c +#include <executorch/devtools/etdump/etdump_flatcc.h> +``` +```c +// 1. Create ETDumpGen BEFORE inference. 
+auto etdump_gen_ptr = std::make_unique<executorch::etdump::ETDumpGen>(); +executorch::etdump::ETDumpGen* etdump_gen = etdump_gen_ptr.get(); + +// 2. Load a method from the program by name with ETDump generator for profiling. +Result<Method> method = program->load_method(method_name, &memory_manager, etdump_gen); + +// 3. Input tensor setup. +Tensor::SizesType sizes[] = {1, 1, 32, 32}; +Tensor::DimOrderType dim_order[] = {0, 2, 3, 1}; +TensorImpl impl(ScalarType::Float, 4, sizes, image_data, dim_order); +Tensor tensor(&impl); +Error status = method->set_input(tensor, 0); + +// 4. Execute. +status = method->execute(); + +// Get ETDump. +if (etdump_gen != nullptr) { + executorch::etdump::ETDumpResult result = etdump_gen->get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + PRINTF("Add a breakpoint here and run this command in Debugger Console: " + "dump binary memory trace.etdump result.buf (result.buf + result.size)\r\n"); + } +} +``` + + +To save an `ETDump` file from the board to a PC, use the **Debug Console** in the MCUXpresso IDE: + +- Set a breakpoint at the `PRINTF(...)` line in the example above. +- Enter the following command in the Debug Console and press **Enter**: + + ``` + dump binary memory trace.etdump result.buf (result.buf + result.size) + ``` + + +
+ Save ETDump in MCUXPresso project +
+ Figure 1: Save ETDump in MCUXPresso Project. +
+
+ + +The resulting `ETDump` file is generated in the project folder within the MCUXpresso workspace. + +> **Note:** +> Profilable models print profiling data to the terminal. Generating this dump may take longer than executing the +> Neutron kernels themselves, but this overhead can be ignored as it affects only models with profiling support +> enabled. The dump generation time is included in the `ETDump` as the final kernel entry. + +--- + +## Creating an Inspector + +The [Inspector](https://docs.pytorch.org/executorch/1.0/model-inspector.html) APIs provide a way to analyze the +contents of `ETRecord` and `ETDump`, enabling developers to gain insights into model architecture +and performance statistics. + +`ETRecord` is an optional argument used to obtain a mapping between the original model and the converted Neutron model. + +An `ETDump` generated on the board contains metadata for each Neutron operator, including its unique identifier. +To visualize this metadata in the Inspector results table, set the `include_delegate_debug_data = True` argument. + +### Example + +```python +from executorch.devtools import Inspector + +inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") +inspector.print_data_tabular(include_delegate_debug_data = True) +``` + +### Complete Example + +A full implementation is available +in [analyzing_with_inspector.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/analyzing_with_inspector.py). @lint-ignore + +--- + +## Summary + +* Build the model with the `--use_profiling` flag enabled. +* Build the ExecuTorch runtime library with the `ET_EVENT_TRACER_ENABLED` flag and the ETDump Developer Tool. +* Use the Debug Console in MCUXpresso to save the `ETDump` file from the board to a PC. +* Visualize the profiling results using the Inspector. 
diff --git a/examples/nxp/analyzing_with_inspector.py b/examples/nxp/analyzing_with_inspector.py new file mode 100644 index 00000000000..b339af79d6e --- /dev/null +++ b/examples/nxp/analyzing_with_inspector.py @@ -0,0 +1,58 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Print profiling table for the NXP Neutron NPU model + +from typing import Any, Union + +from executorch.devtools import Inspector + + +def parse_delegate_metadata( + delegate_metadatas: list[bytes], +) -> Union[list[str], dict[str, Any]]: + """Metadata parser for Neutron Backend metadata. + + The parser is a callable that deserializes the data and returns neutron kernel number. + The deserialized data is then added back to the corresponding event in the event block for user consumption. + """ + + metadata_list = [] + for metadata_bytes in delegate_metadatas: + if len(metadata_bytes) == 1: + function_code = metadata_bytes[0] + if function_code == 0: + metadata_list.append("Profiling dump") + else: + metadata_list.append("Neutron kernel " + str(function_code)) + else: + metadata_list.append("Invalid metadata size") + return metadata_list + + +if __name__ == "__main__": + + try: + etrecord_path = "etrecord/etrecord.bin" + etdump_path = "etdump/trace.etdump" + inspector = Inspector( + etdump_path=etdump_path, + etrecord=etrecord_path, + delegate_metadata_parser=parse_delegate_metadata, + ) + + # Access raw event data and filter quantized_decomposed nodes + for event_block in inspector.event_blocks: + for event in event_block.events: + if hasattr(event, "op_types") and isinstance(event.op_types, list): + # Filter out quantized_decomposed ops from the actual list + filtered = [ + op for op in event.op_types if "quantized_decomposed" not in op + ] + event.op_types = filtered if filtered else event.op_types + + inspector.print_data_tabular(include_delegate_debug_data=True) + except Exception 
as e: + print(f"Error during inspection: {type(e).__name__}: {e}") diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py index f5f92d36541..258b4c87772 100644 --- a/examples/nxp/aot_neutron_compile.py +++ b/examples/nxp/aot_neutron_compile.py @@ -8,6 +8,7 @@ import argparse import io import logging +import os from collections import defaultdict import executorch.extension.pybindings.portable_lib @@ -167,6 +168,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): default=False, help="Use QAT mode for quantization (performs two QAT training epochs)", ) + parser.add_argument( + "--use_profiling", + action="store_true", + required=False, + default=False, + help="Enable profiling for eIQ Neutron NPU delegated model", + ) parser.add_argument( "-s", "--so_library", @@ -322,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): operators_not_to_delegate=args.operators_not_to_delegate, fetch_constants_to_sram=args.fetch_constants_to_sram, dump_kernel_selection_code=args.dump_kernel_selection_code, + use_profiling=args.use_profiling, ) partitioners = ( [ @@ -338,6 +347,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): edge_program_manager = to_edge_transform_and_lower( export(module, example_inputs, strict=True), transform_passes=NeutronEdgePassManager(), + generate_etrecord=args.use_profiling, partitioner=partitioners, compile_config=EdgeCompileConfig( _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -360,6 +370,21 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): exec_prog = edge_program_manager.to_executorch( config=ExecutorchBackendConfig(extract_delegate_segments=False) ) + + # Generate ETRecord if profiling flag is set + if args.use_profiling: + etrecord_path = os.path.join("etrecord", f"{args.model_name}_etrecord.bin") + # Create directory if it doesn't exist + 
os.makedirs(os.path.dirname(etrecord_path), exist_ok=True) + # Save ETRecord + exec_prog.get_etrecord().save(etrecord_path) + # Notify the user about profiling enablement and ETRecord generation. + logging.info( + "The model was converted with profiling enabled. The time spent generating the profiling dump is traced as the " + "final delegate operation and can be ignored, as no dump is produced for non‑profilable models." + ) + logging.info(f"The ETRecord for the model was saved to {etrecord_path}.") + except RuntimeError as e: if "Missing out variants" in str(e.args[0]): raise RuntimeError( @@ -378,8 +403,10 @@ def executorch_program_to_str(ep, verbose=False): logging.debug(f"Executorch program:\n{executorch_program_to_str(exec_prog)}") # 6. Serialize to *.pte - model_name = f"{args.model_name}" + ( - "_nxp_delegate" if args.delegate is True else "" + model_name = ( + f"{args.model_name}" + + ("_nxp_delegate" if args.delegate is True else "") + + ("_profile" if args.use_profiling is True else "") ) save_pte_program(exec_prog, model_name)