def _run_fp16_stage(
    *,
    model_path: Path,
    stage_timings: list[tuple[str, float | None]],
) -> Path:
    """Convert the ONNX file at ``model_path`` to FP16, rewriting it in place.

    The model is loaded, passed through ``convert_to_fp16`` with
    ``keep_io_types=True`` and saved back to the same path, while a
    ``StageLive`` display labelled ``"fp16"`` reports status, elapsed time
    and the resulting artifact size.

    Args:
        model_path: ONNX file to convert; it is overwritten with the result.
        stage_timings: Accumulator that receives one ``("FP16", elapsed)`` entry.

    Returns:
        ``model_path`` itself, which now holds the converted model.
    """
    from ..onnx import load_onnx, save_onnx
    from ..optim.fp16 import convert_to_fp16
    from ..utils.console import StageLive

    with StageLive("fp16", console) as stage:
        stage.set_status("Converting to FP16...")
        started = time.monotonic()

        # Load -> convert -> save over the same file; the timer covers all three.
        fp16_proto = convert_to_fp16(load_onnx(model_path), keep_io_types=True)
        save_onnx(fp16_proto, model_path)

        elapsed = time.monotonic() - started
        stage.set_done(elapsed)
        stage.detail("[dim]I/O types preserved as FP32[/dim]")
        stage.artifact(str(model_path), _safe_size(model_path))
        stage.blank()

    # Recorded only after the live display has closed, as the last step.
    stage_timings.append(("FP16", elapsed))
    return model_path
-1437,6 +1495,7 @@ def _build_onnx_pipeline( max_iters: int = extra_kwargs.pop("hack_max_optim_iterations", 3) allow_unsupported_nodes: bool = extra_kwargs.pop("allow_unsupported_nodes", False) + _precision: str | None = extra_kwargs.pop("precision", None) # ── Validate + setup ───────────────────────────────────────── if not onnx_path.exists(): @@ -1490,13 +1549,24 @@ def _build_onnx_pipeline( config_path.write_text(json.dumps(config.to_dict(), indent=2)) - # ── Quantize stage ─────────────────────────────────────────── - current_path = _run_quantize_stage( - config=config, - current_path=current_path, - quantized_path=quantized_path, - stage_timings=stage_timings, - ) + # ── FP16 conversion (when --precision fp16) ────────────────── + if _precision == "fp16": + current_path = _run_fp16_stage( + model_path=current_path, + stage_timings=stage_timings, + ) + + # ── Quantize stage (skipped when FP16 — incompatible) ──────── + if _precision == "fp16" and config.quant is not None: + print_stage_skip(console, "quantize", "(incompatible with --precision fp16)") + stage_timings.append(("Quantize", None)) + else: + current_path = _run_quantize_stage( + config=config, + current_path=current_path, + quantized_path=quantized_path, + stage_timings=stage_timings, + ) # ── Compile stage ──────────────────────────────────────────── current_path = _run_compile_stage( diff --git a/src/winml/modelkit/commands/export.py b/src/winml/modelkit/commands/export.py index 4cae34a86..7527b039e 100644 --- a/src/winml/modelkit/commands/export.py +++ b/src/winml/modelkit/commands/export.py @@ -130,6 +130,7 @@ def _delete_onnx_with_external_data(onnx_path: Path) -> None: help='JSON with shape overrides (e.g., {"sequence_length": 2048, "height": 640}).', ) @cli_utils.build_config_option() +@cli_utils.precision_option(optional_message="When fp16, applies FP16 conversion after export.") @cli_utils.verbosity_options() @click.pass_context def export( @@ -148,6 +149,7 @@ def export( export_config: 
Path | None, shape_config: Path | None, config_file: Path | None, + precision: str | None, ) -> None: r"""Export HuggingFace model to ONNX format with HTP. @@ -420,6 +422,17 @@ def export( ) logger.debug("Export stats: %s", export_stats) + # Post-export FP16 conversion when --precision fp16 is specified + if precision == "fp16": + console.print("[bold]Converting to FP16...[/bold]") + from ..onnx import load_onnx, save_onnx + from ..optim.fp16 import convert_to_fp16 + + fp16_model = load_onnx(output_path) + fp16_model = convert_to_fp16(fp16_model, keep_io_types=True) + save_onnx(fp16_model, output_path) + console.print("[dim]FP16 conversion applied (I/O kept as FP32)[/dim]") + # TODO: re-enable post-export optimization (shape inference, constant folding) # Disabled: needs validation that optimize_onnx preserves HTP hierarchy tags. # from ..optim.api import optimize_onnx diff --git a/src/winml/modelkit/commands/optimize.py b/src/winml/modelkit/commands/optimize.py index 287f9a423..355ef2a2f 100644 --- a/src/winml/modelkit/commands/optimize.py +++ b/src/winml/modelkit/commands/optimize.py @@ -180,6 +180,21 @@ def capability_options(func: F) -> F: default=None, help="Configuration file (YAML/JSON)", ) +@cli_utils.precision_option(optional_message="Applies FP16 conversion after graph optimization.") +@click.option( + "--fp16-keep-io-types/--no-fp16-keep-io-types", + "fp16_keep_io_types", + default=True, + show_default=True, + help="Keep model I/O as FP32 when --precision fp16 (insert Cast at boundary)", +) +@click.option( + "--fp16-op-block-list", + "fp16_op_block_list", + type=str, + default=None, + help="Comma-separated list of op types to keep in FP32 (e.g., LayerNorm,Softmax)", +) @cli_utils.verbosity_options() @capability_options @click.pass_context # type: ignore[arg-type] # capability_options widens the signature; click stubs want positional-only ctx but we keep it keyword-callable for back-compat @@ -190,6 +205,9 @@ def optimize( model: Path | None, output: Path 
| None, config: Path | None, + precision: str | None, + fp16_keep_io_types: bool, + fp16_op_block_list: str | None, verbose: int, quiet: bool, **kwargs: Any, @@ -224,6 +242,17 @@ def optimize( # Basic optimization with GELU fusion winml optimize -m model.onnx -o model_opt.onnx --enable-gelu-fusion + # Convert model to FP16 (after graph optimization) + winml optimize -m model.onnx -o fp16.onnx --precision fp16 + + # FP16 without preserving I/O types + winml optimize -m model.onnx -o fp16.onnx --precision fp16 \ + --no-fp16-keep-io-types + + # FP16 with specific ops kept in FP32 + winml optimize -m model.onnx -o fp16.onnx --precision fp16 \ + --fp16-op-block-list LayerNorm,Softmax + # Use config file winml optimize -m model.onnx -c config.toml """ @@ -406,6 +435,22 @@ def optimize( optimizer = Optimizer() optimized_model = optimizer.optimize(onnx_model, **optimizer_kwargs) + # Post-optimization FP16 conversion (command-layer, not a pipe) + if precision == "fp16": + from ..optim.fp16 import convert_to_fp16 + + console.print("[bold]Converting to FP16...[/bold]") + op_block = ( + [s.strip() for s in fp16_op_block_list.split(",") if s.strip()] + if fp16_op_block_list + else None + ) + optimized_model = convert_to_fp16( + optimized_model, + keep_io_types=fp16_keep_io_types, + op_block_list=op_block, + ) + console.print("[bold]Saving optimized model...[/bold]") save_onnx(optimized_model, output) diff --git a/src/winml/modelkit/optim/__init__.py b/src/winml/modelkit/optim/__init__.py index dc1f3a983..dd0196219 100644 --- a/src/winml/modelkit/optim/__init__.py +++ b/src/winml/modelkit/optim/__init__.py @@ -27,6 +27,7 @@ from .api import optimize_onnx from .config import WinMLOptimizationConfig from .errors import ConfigurationError, ModelValidationError, OptimizationError +from .fp16 import convert_to_fp16 from .optimizer import Optimizer from .registry import ( BoolCapability, @@ -48,6 +49,7 @@ "Optimizer", "WinMLOptimizationConfig", "auto_enable_dependencies", + 
# ---- src/winml/modelkit/optim/fp16.py (new file) ----
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""FP16 conversion utility for ONNX models.

Provides a single entry point for FP32→FP16 model conversion, used by
all CLI commands (optimize, build, export) at the command layer.

This is NOT an optimizer pipe — FP16 is a precision transformation (like
quantization), not a graph optimization. It runs after optimization and
before quantization in the build pipeline.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING


if TYPE_CHECKING:
    import onnx

logger = logging.getLogger(__name__)


def convert_to_fp16(
    model: onnx.ModelProto,
    *,
    keep_io_types: bool = True,
    op_block_list: list[str] | None = None,
) -> onnx.ModelProto:
    """Convert an ONNX model from FP32 to FP16 precision.

    Delegates to ``onnxruntime.transformers.float16.convert_float_to_float16``
    and then re-sorts the graph topologically. ORT is already a project
    dependency, so no new dependency is introduced.

    The conversion is skipped (and ``model`` returned untouched) when the model
    has at least one floating-point initializer and every floating-point
    initializer is already FLOAT16. Non-float initializers (e.g. INT64 shape
    tensors) are ignored by that check.

    Args:
        model: Input ONNX ModelProto. Treat it as consumed: the converter may
            mutate it, so callers should only use the returned model afterwards.
        keep_io_types: If True, preserve FP32 model inputs/outputs by inserting
            Cast nodes at the graph boundary.
        op_block_list: Op types to keep in FP32 (e.g. ``["LayerNorm", "Softmax"]``).
            Passed through unchanged; when None, ORT falls back to its own
            default block list.

    Returns:
        The converted model, or ``model`` itself when the conversion was skipped.
    """
    from onnx import TensorProto
    from onnxruntime.transformers.float16 import convert_float_to_float16
    from onnxruntime.transformers.onnx_model import OnnxModel

    # Skip if the model is already FP16. Only floating-point initializers are
    # inspected; the set is built once rather than per initializer.
    float_types = {
        TensorProto.FLOAT,
        TensorProto.DOUBLE,
        TensorProto.BFLOAT16,
        TensorProto.FLOAT16,
    }
    float_inits = [t for t in model.graph.initializer if t.data_type in float_types]
    if float_inits and all(t.data_type == TensorProto.FLOAT16 for t in float_inits):
        logger.info("Model is already FP16 — skipping conversion.")
        return model

    original_nodes = len(model.graph.node)

    logger.info("Converting model to FP16...")
    if keep_io_types:
        logger.info("  Keeping I/O types as FP32")
    if op_block_list:
        logger.info("  Keeping ops in FP32: %s", op_block_list)

    converted = convert_float_to_float16(
        model,
        keep_io_types=keep_io_types,
        op_block_list=op_block_list,
    )

    # ORT's converter appends the Cast nodes it inserts to the end of the node
    # list, which breaks topological ordering. Boundary casts come from
    # keep_io_types, but casts around block-listed ops (explicit list or ORT's
    # default list) appear to be appended the same way, so re-sort
    # unconditionally instead of only when keep_io_types is set. Sorting an
    # already-ordered graph is harmless.
    OnnxModel.graph_topological_sort(converted.graph)

    converted_nodes = len(converted.graph.node)
    if converted_nodes != original_nodes:
        logger.info("FP16 conversion complete: %d -> %d nodes", original_nodes, converted_nodes)
    else:
        logger.info("FP16 conversion complete: %d nodes", converted_nodes)

    return converted


# ---- src/winml/modelkit/utils/cli.py (helper added next to the other shared options) ----
# `click`, `Callable` and `F` come from cli.py's own imports, which are outside
# the visible hunk.
def precision_option(
    required: bool = False,
    optional_message: str | None = None,
) -> Callable[[F], F]:
    """Add the shared ``--precision`` option to a Click command.

    The option accepts ``fp32`` or ``fp16``. Its Click default is ``None``, so a
    command receives ``None`` when the flag is omitted; the commands in this
    change only act on the literal value ``"fp16"``.

    Args:
        required: Whether the option is required.
        optional_message: Extra guidance appended to the help text.

    Returns:
        Decorator that attaches the option to a command function.
    """
    help_text = "Model precision: fp32 (default) or fp16."
    if optional_message:
        help_text = f"{help_text} {optional_message}"

    return click.option(
        "--precision",
        type=click.Choice(["fp32", "fp16"]),
        default=None,
        required=required,
        help=help_text,
    )
# ---- tests/unit/optim/test_fp16.py (new file) ----
"""FP16 conversion utility tests.

Tests for winml.modelkit.optim.fp16.convert_to_fp16 which converts
FP32 ONNX models to FP16 precision.

Following Cardinal Rules:
- CARDINAL RULE #1: No hardcoded model architectures
- CARDINAL RULE #2: All tests use pytest with code-generated results
- CARDINAL RULE #3: Tests must run and pass
"""

from __future__ import annotations

import numpy as np
import onnx
from onnx import TensorProto, numpy_helper

from winml.modelkit.optim import convert_to_fp16


# =============================================================================
# HELPERS
# =============================================================================


def _make_add_model(
    graph_name: str,
    elem_type: int,
    np_dtype: type,
    *,
    with_relu: bool = False,
    extra_initializers: tuple[onnx.TensorProto, ...] = (),
) -> onnx.ModelProto:
    """Build ``out = x + weight`` (optionally followed by Relu) at opset 17.

    ``elem_type`` is the ONNX element type of the graph input/output and
    ``np_dtype`` the numpy dtype of the ``weight`` initializer.
    """
    x = onnx.helper.make_tensor_value_info("x", elem_type, [1, 4])
    out = onnx.helper.make_tensor_value_info("out", elem_type, [1, 4])
    weight = numpy_helper.from_array(np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np_dtype), "weight")
    if with_relu:
        nodes = [
            onnx.helper.make_node("Add", ["x", "weight"], ["add_out"], name="add"),
            onnx.helper.make_node("Relu", ["add_out"], ["out"], name="relu"),
        ]
    else:
        nodes = [onnx.helper.make_node("Add", ["x", "weight"], ["out"], name="add")]
    graph = onnx.helper.make_graph(nodes, graph_name, [x], [out], [weight, *extra_initializers])
    return onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 17)])


def _build_simple_fp32_model() -> onnx.ModelProto:
    """Build a simple FP32 model: out = x + weight."""
    return _make_add_model("simple", TensorProto.FLOAT, np.float32)


def _build_multi_op_fp32_model() -> onnx.ModelProto:
    """Build a model with multiple ops: out = Relu(x + weight)."""
    return _make_add_model("multi_op", TensorProto.FLOAT, np.float32, with_relu=True)


def _count_casts(model: onnx.ModelProto) -> int:
    """Return the number of Cast nodes in the model's main graph."""
    return sum(node.op_type == "Cast" for node in model.graph.node)


# =============================================================================
# CONVERT_TO_FP16 TESTS
# =============================================================================


class TestConvertToFP16:
    """Test convert_to_fp16 utility function."""

    def test_converts_weights_to_fp16(self) -> None:
        """FP16 conversion converts float32 initializers to float16."""
        result = convert_to_fp16(_build_simple_fp32_model())

        has_fp16 = any(init.data_type == TensorProto.FLOAT16 for init in result.graph.initializer)
        assert has_fp16, "Expected at least one FP16 initializer after conversion"

    def test_default_keeps_io_types(self) -> None:
        """Default keep_io_types=True preserves FP32 model I/O."""
        result = convert_to_fp16(_build_simple_fp32_model(), keep_io_types=True)

        for inp in result.graph.input:
            assert inp.type.tensor_type.elem_type == TensorProto.FLOAT
        for outp in result.graph.output:
            assert outp.type.tensor_type.elem_type == TensorProto.FLOAT

    def test_keep_io_types_false_converts_io(self) -> None:
        """With keep_io_types=False, model I/O becomes FP16."""
        result = convert_to_fp16(_build_simple_fp32_model(), keep_io_types=False)

        for inp in result.graph.input:
            assert inp.type.tensor_type.elem_type == TensorProto.FLOAT16
        for outp in result.graph.output:
            assert outp.type.tensor_type.elem_type == TensorProto.FLOAT16

    def test_preserves_model_structure(self) -> None:
        """FP16 conversion preserves graph structure (node count diff ≤ 2)."""
        model = _build_multi_op_fp32_model()
        # Capture the count first: the converter may mutate the input model.
        original_count = len(model.graph.node)
        result = convert_to_fp16(model, keep_io_types=True)
        converted_count = len(result.graph.node)

        assert converted_count - original_count <= 2, (
            f"Node count changed from {original_count} to {converted_count}, "
            f"difference {converted_count - original_count} exceeds threshold of 2"
        )

    def test_op_block_list_keeps_ops_in_fp32(self) -> None:
        """Ops in block list should remain operating on FP32 data."""
        # keep_io_types=True inserts boundary Cast nodes on its own, so merely
        # finding a Cast would pass even if the block list were ignored.
        # Disable I/O casts and compare against a run without the block list:
        # any extra Cast can then only come from isolating the blocked op.
        baseline = convert_to_fp16(_build_multi_op_fp32_model(), keep_io_types=False)
        blocked = convert_to_fp16(
            _build_multi_op_fp32_model(),
            keep_io_types=False,
            op_block_list=["Relu"],
        )

        assert _count_casts(blocked) > _count_casts(baseline), (
            "Expected additional Cast nodes around the blocked Relu op"
        )

    def test_none_op_block_list_uses_ort_defaults(self) -> None:
        """When op_block_list is None, ORT uses its DEFAULT_OP_BLOCK_LIST."""
        # Should not raise — ORT applies its default safety list
        result = convert_to_fp16(_build_simple_fp32_model(), op_block_list=None)
        assert result is not None

    def test_skips_already_fp16_model(self) -> None:
        """If all floating-point initializers are already FP16, conversion is skipped."""
        # Build a model with FP16 initializers directly
        model = _make_add_model("fp16_model", TensorProto.FLOAT16, np.float16)

        original_nodes = len(model.graph.node)
        result = convert_to_fp16(model)

        # Should return the same model unchanged (no Cast nodes inserted)
        assert len(result.graph.node) == original_nodes
        assert result is model

    def test_skips_fp16_model_with_int_initializers(self) -> None:
        """FP16 model with non-float initializers (e.g. INT64 shapes) should still skip."""
        # INT64 initializer (e.g., shape tensor) — should be ignored by skip logic
        shape_tensor = numpy_helper.from_array(np.array([1, 4], dtype=np.int64), "shape")
        model = _make_add_model(
            "fp16_mixed",
            TensorProto.FLOAT16,
            np.float16,
            extra_initializers=(shape_tensor,),
        )

        original_nodes = len(model.graph.node)
        result = convert_to_fp16(model)

        assert len(result.graph.node) == original_nodes
        assert result is model