Arm backend: Partition boundary Q/DQ nodes for INT+FP (#16312)

martinlsm · Martin Lindström · web-flow · commit ab0212931a3e · 2025-12-18T22:41:52.000+01:00
For full INT lowering without FP support, Q/DQ nodes that are on the
boundary of a partition are not included in the partition. With INT+FP
support, the backend is able to handle them properly. Therefore, include
these boundary nodes in that setting.

To add test coverage for this new feature, a new stage called
"check_not.exir_quant_nodes" is added for TosaPipelineINT and
VgfPipeline in case both FP and INT profiles are enabled. This stage
verifies that no exir Q/DQ nodes remain in the graph after
"to_edge_transform_and_lower" (or "partition" if
"to_edge_transform_and_lower" is omitted). In case a test fails to
partition the boundary Q/DQ nodes in INT+FP lowering, it will be
detected in "check_not.exir_quant_nodes". Tests in
test_quant_custom_meta.py will run this new check.


Signed-off-by: Martin Lindström <Martin.Lindstroem@arm.com>
Co-authored-by: Martin Lindström <Martin.Lindstroem@arm.com>
diff --git a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py
@@ -58,6 +58,14 @@ def call_operator(self, op, args, kwargs, meta):
         if not (input_dtype == torch.float32 and output_dtype == torch.int32):
             return super().call_operator(op, args, kwargs, meta)
 
+        # For some ops, qparams dtype is inconsistent with fake tensor's dtype.
+        # Skip decorating if the input is quantized and thus not floating point.
+        if (
+            "output_qparams" in input.node.meta
+            and len(input.node.meta["output_qparams"]) > 0
+        ):
+            return super().call_operator(op, args, kwargs, meta)
+
         op_full, op_ge, op_floor, op_ceil, op_where = _get_decorated_ops(op)
 
         zero = super().call_operator(
diff --git a/backends/arm/test/misc/test_mixed_type_lowering.py b/backends/arm/test/misc/test_mixed_type_lowering.py
@@ -17,6 +17,13 @@ def combine_op_dicts(*dicts):
     return {op: dict(counts) for op, counts in merged.items()}
 
 
+def repeat_op_dict(op_dict, times):
+    repeated = {}
+    for op, dtypes in op_dict.items():
+        repeated[op] = {dtype: count * times for (dtype, count) in dtypes.items()}
+    return repeated
+
+
 # TODO Figure out how to handle multiple dq/q nodes properly
 # See backends/arm/_passes/decompose_quant_nodes.py for details
 dq_tosa_ops = {
@@ -35,7 +42,6 @@ def combine_op_dicts(*dicts):
     "CEIL": {"FP32": 1},  # for rounding
     "FLOOR": {"FP32": 1},  # for rounding
 }
-q_dq_tosa_ops = combine_op_dicts(dq_tosa_ops, q_tosa_ops)
 
 
 class AddSigmoidMul(torch.nn.Module):
@@ -61,7 +67,12 @@ def test_mixed_type_lowering():
             "ADD": {"INT32": 1},  # ADD should be executed in INT32
             "MUL": {"INT32": 1},  # MUL should be executed in INT32
         },
-        q_dq_tosa_ops,
+        repeat_op_dict(
+            q_tosa_ops, 3
+        ),  # Two decomposed boundary Q nodes + one for SIGMOID
+        repeat_op_dict(
+            dq_tosa_ops, 2
+        ),  # One decomposed boundary DQ node + one for SIGMOID
     )
 
     pipeline.add_stage_after(
diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py
@@ -390,14 +390,15 @@ def __init__(
             ),
         }
         tosa_version = _require_tosa_version()
+        tosa_spec: TosaSpecification = tosa_profiles[tosa_version]
 
         compile_spec = common.get_tosa_compile_spec(
-            tosa_profiles[tosa_version],
+            tosa_spec,
             custom_path=custom_path,
             tosa_debug_mode=tosa_debug_mode,
         )
 
-        quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
+        quantizer = TOSAQuantizer(tosa_spec)
         # choose 16A8W quantization config when int16 extension is requested
         if "int16" in tosa_extensions:
             quantization_config = get_symmetric_a16w8_quantization_config(
@@ -422,7 +423,7 @@ def __init__(
         )
         self.add_stage(self.tester.quantize, quant_stage, pos=0)
 
-        remove_quant_nodes_stage = (
+        remove_torch_quant_nodes_stage = (
             "to_edge_transform_and_lower"
             if use_to_edge_transform_and_lower
             else "partition"
@@ -440,7 +441,7 @@ def __init__(
                 suffix="quant_nodes",
             )
             self.add_stage_after(
-                remove_quant_nodes_stage,
+                remove_torch_quant_nodes_stage,
                 self.tester.check_not,
                 [
                     "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
@@ -449,6 +450,21 @@ def __init__(
                 suffix="quant_nodes",
             )
 
+        # For pure INT lowering, outer exir Q/DQ nodes remain in the graph because we can't partition them.
+        # In INT+FP lowering, we partition these nodes, so a check is added to verify that.
+        if tosa_spec.support_integer() and tosa_spec.support_float():
+            self.add_stage_after(
+                remove_torch_quant_nodes_stage,
+                self.tester.check_not,
+                [
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_channel_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default",
+                ],
+                suffix="exir_quant_nodes",
+            )
+
         if run_on_tosa_ref_model:
             self.add_stage(
                 self.tester.run_method_and_compare_outputs,
@@ -1093,6 +1109,12 @@ def __init__(
             transform_passes=transform_passes,
         )
 
+        remove_torch_quant_nodes_stage = (
+            "to_edge_transform_and_lower"
+            if use_to_edge_transform_and_lower
+            else "partition"
+        )
+
         if quantize:
             quantizer = VgfQuantizer(compile_spec)
             quantization_config = get_symmetric_quantization_config(
@@ -1104,12 +1126,6 @@ def __init__(
 
             self.add_stage(self.tester.quantize, quant_stage, pos=0)
 
-            remove_quant_nodes_stage = (
-                "to_edge_transform_and_lower"
-                if use_to_edge_transform_and_lower
-                else "partition"
-            )
-
             if _has_quantizable_inputs(test_data):
                 # only add stages if we have quantizable input
                 self.add_stage_after(
@@ -1122,7 +1138,7 @@ def __init__(
                     suffix="quant_nodes",
                 )
                 self.add_stage_after(
-                    remove_quant_nodes_stage,
+                    remove_torch_quant_nodes_stage,
                     self.tester.check_not,
                     [
                         "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
@@ -1141,6 +1157,21 @@ def __init__(
                 suffix="quant_nodes",
             )
 
+        # For pure INT lowering, outer exir Q/DQ nodes remain in the graph because we can't partition them.
+        # In INT+FP lowering, we partition these nodes, so a check is added to verify that.
+        if tosa_spec.support_integer() and tosa_spec.support_float():
+            self.add_stage_after(
+                remove_torch_quant_nodes_stage,
+                self.tester.check_not,
+                [
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_channel_default",
+                    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default",
+                ],
+                suffix="exir_quant_nodes",
+            )
+
         if run_on_vulkan_runtime:
             self.add_stage(self.tester.serialize)
             self.add_stage(
diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py
@@ -186,6 +186,56 @@ def __init__(
         self.additional_checks = additional_checks
         self.tosa_spec = compile_spec.tosa_spec
 
+    def _detag_boundary_nodes(
+        self, module: GraphModule, tag: str, reporter: WhyNoPartitionReporter
+    ) -> None:
+        """De-tag nodes at the partition boundary.
+
+        Remove delegation tags from quantize nodes with inputs outside the
+        partition and from dequantize nodes with outputs outside the partition.
+
+        For non Q/DQ nodes, remove the tag from the first node in the partition
+        if any input has floating-point dtype.
+
+        Args:
+            tag: The delegation tag assigned to the partition.
+            reporter: A reporter to log rejected nodes.
+            module: The GraphModule containing the partition.
+
+        """
+
+        # De-tag outermost q-nodes upwards and dq-nodes downwards.
+        # De-tag if at least one input/output is not part of the partition.
+        for node in module.graph.nodes:
+            if not is_partitioned(node, tag):
+                continue
+
+            is_q_node = node.target in Q_OPS
+            is_dq_node = node.target in DQ_OPS
+            is_boundary_q_node = is_q_node and not is_partitioned(
+                node.all_input_nodes[0], tag
+            )
+            is_boundary_dq_node = is_dq_node and any(
+                not is_partitioned(user, tag) for user in node.users
+            )
+
+            if is_boundary_q_node or is_boundary_dq_node:
+                # Remove tag from quantize node with input outside partition,
+                # or dequantize node with any output outside partition
+                del node.meta["delegation_tag"]
+            elif not is_q_node and not is_dq_node:
+                # For non Q/DQ nodes, remove tag from first node in partition if any input has fp dtype
+                for input in node.all_input_nodes:
+                    if is_partitioned(input, tag):
+                        continue
+                    if get_first_fake_tensor(input).dtype.is_floating_point:
+                        reporter.report_reject(
+                            node,
+                            f"Was first node in partition and input {input.name} had fp dtype.",
+                        )
+                        del node.meta["delegation_tag"]
+                        break
+
     def _tag_module(  # noqa
         self,
         module: GraphModule,
@@ -233,39 +283,13 @@ def _tag_module(  # noqa
             for node in partition.nodes:
                 node.meta["delegation_tag"] = tag
 
-            # De-tag outermost q-nodes upwards and dq-nodes downwards.
-            # De-tag if at least one input/output is not part of the partition.
-            for node in module.graph.nodes:
-                if not is_partitioned(node, tag):
-                    continue
-                if node.target in Q_OPS:
-                    for input in node.all_input_nodes:
-                        if not is_partitioned(input, tag):
-                            del node.meta["delegation_tag"]
-                            break
-                    continue
-
-                if node.target in DQ_OPS:
-                    for user in node.users:
-                        if not is_partitioned(user, tag):
-                            del node.meta["delegation_tag"]
-                            break
-                    continue
-
-                if self.tosa_spec.support_float():
-                    continue
-
-                if is_partitioned(node, tag):
-                    for input in node.all_input_nodes:
-                        if is_partitioned(input, tag):
-                            continue
-                        if get_first_fake_tensor(input).dtype.is_floating_point:
-                            reporter.report_reject(
-                                node,
-                                f"Was first node in partition and input {input.name} had fp dtype.",
-                            )
-                            del node.meta["delegation_tag"]
-                            break
+            if self.tosa_spec.support_integer() and not self.tosa_spec.support_float():
+                # Detag boundary Q/DQ since we cannot handle them without float support
+                self._detag_boundary_nodes(
+                    module,
+                    tag,
+                    reporter,
+                )
 
             is_noop_partition = all(
                 is_noop_clone(node)