Skip to content

Commit 9d5e32c

Browse files
JukkaLp-sawicki
andauthored
[mypyc] Fix non-deterministic compiler output due to frozensets (#21631)
Frozenset literals were emitted in an unpredictable order due to hash randomization. Make the order deterministic. I used some coding agent assist (esp. with tests). --------- Co-authored-by: Piotr Sawicki <sawickipiotr@outlook.com>
1 parent 5ef0902 commit 9d5e32c

4 files changed

Lines changed: 113 additions & 29 deletions

File tree

mypyc/codegen/emit.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22

33
from __future__ import annotations
44

5-
import pprint
65
import sys
7-
import textwrap
86
from collections.abc import Callable
97
from typing import Final
108

119
from mypyc.codegen.cstring import c_string_initializer
12-
from mypyc.codegen.literals import Literals
10+
from mypyc.codegen.literals import Literals, literal_sort_key
1311
from mypyc.common import (
1412
ATTR_PREFIX,
1513
BITMAP_BITS,
@@ -237,24 +235,16 @@ def attr(self, name: str) -> str:
237235
return ATTR_PREFIX + name
238236

239237
def object_annotation(self, obj: object, line: str) -> str:
240-
"""Build a C comment with an object's string representation.
238+
"""Build a C comment with a literal value's string representation.
241239
242-
If the comment exceeds the line length limit, it's wrapped into a
243-
multiline string (with the extra lines indented to be aligned with
244-
the first line's comment).
240+
This is a debugging aid that makes generated C easier to read.
245241
246-
If it contains illegal characters, an empty string is returned."""
247-
line_width = self._indent + len(line)
248-
formatted = pprint.pformat(obj, compact=True, indent=1, width=max(90 - line_width, 20))
249-
if any(x in formatted for x in ("/*", "*/", "\0")):
242+
If it contains illegal characters or is too long, return an empty string.
243+
"""
244+
formatted = stable_literal_repr(obj)
245+
if any(x in formatted for x in ("/*", "*/", "\0")) or len(formatted) >= 256:
250246
return ""
251-
252-
if "\n" in formatted:
253-
first_line, rest = formatted.split("\n", maxsplit=1)
254-
comment_continued = textwrap.indent(rest, (line_width + 3) * " ")
255-
return f" /* {first_line}\n{comment_continued} */"
256-
else:
257-
return f" /* {formatted} */"
247+
return f" /* {formatted} */"
258248

259249
def emit_line(self, line: str = "", *, ann: object = None) -> None:
260250
if line.startswith("}"):
@@ -1486,3 +1476,21 @@ def native_function_doc_initializer(func: FuncIR) -> str:
14861476
return "NULL"
14871477
docstring = f"{text_sig}\n--\n\n"
14881478
return c_string_initializer(docstring.encode("ascii", errors="backslashreplace"))
1479+
1480+
1481+
def stable_literal_repr(obj: object) -> str:
1482+
"""Return a single-line repr of a literal value.
1483+
1484+
Behaves like repr() for most values, but renders frozenset members in a
1485+
deterministic order (frozenset iteration order is hash-seed dependent).
1486+
"""
1487+
if isinstance(obj, frozenset):
1488+
if not obj:
1489+
return "frozenset()"
1490+
items = ", ".join(stable_literal_repr(item) for item in sorted(obj, key=literal_sort_key))
1491+
return "frozenset({" + items + "})"
1492+
elif isinstance(obj, tuple):
1493+
if len(obj) == 1:
1494+
return "(" + stable_literal_repr(obj[0]) + ",)"
1495+
return "(" + ", ".join(stable_literal_repr(item) for item in obj) + ")"
1496+
return repr(obj)

mypyc/codegen/literals.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ def record_literal(self, value: LiteralValue) -> None:
6565
elif isinstance(value, frozenset):
6666
frozenset_literals = self.frozenset_literals
6767
if value not in frozenset_literals:
68-
for item in value:
68+
# Sort members so that we don't depend on frozenset iteration order.
69+
for item in sorted(value, key=literal_sort_key):
6970
assert _is_literal_value(item)
7071
self.record_literal(item)
7172
frozenset_literals[value] = len(frozenset_literals)
@@ -140,10 +141,14 @@ def encoded_tuple_values(self) -> list[str]:
140141
return self._encode_collection_values(self.tuple_literals)
141142

142143
def encoded_frozenset_values(self) -> list[str]:
143-
return self._encode_collection_values(self.frozenset_literals)
144+
# Ensure deterministic frozenset item order by sorting items.
145+
return self._encode_collection_values(self.frozenset_literals, sort_items=True)
144146

145147
def _encode_collection_values(
146-
self, values: dict[tuple[object, ...], int] | dict[frozenset[object], int]
148+
self,
149+
values: dict[tuple[object, ...], int] | dict[frozenset[object], int],
150+
*,
151+
sort_items: bool = False,
147152
) -> list[str]:
148153
"""Encode tuple/frozenset values into a C array.
149154
@@ -164,7 +169,8 @@ def _encode_collection_values(
164169
for i in range(count):
165170
value = value_by_index[i]
166171
result.append(str(len(value)))
167-
for item in value:
172+
items = sorted(value, key=literal_sort_key) if sort_items else value
173+
for item in items:
168174
assert _is_literal_value(item)
169175
index = self.literal_index(item)
170176
result.append(str(index))
@@ -299,3 +305,13 @@ def _encode_complex_values(values: dict[complex, int]) -> list[str]:
299305
result.append(float_to_c(value.real))
300306
result.append(float_to_c(value.imag))
301307
return result
308+
309+
310+
def literal_sort_key(value: object) -> tuple[object, ...]:
311+
"""Return a sort key for a literal value."""
312+
if isinstance(value, frozenset):
313+
# Sort items to avoid depending on the unpredictable iteration order.
314+
return ("frozenset", tuple(sorted(literal_sort_key(item) for item in value)))
315+
elif isinstance(value, tuple):
316+
return ("tuple", tuple(literal_sort_key(item) for item in value))
317+
return (type(value).__name__, repr(value))

mypyc/test/test_emit.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,23 @@ def test_reg(self) -> None:
4242

4343
def test_object_annotation(self) -> None:
4444
assert self.emitter.object_annotation("hello, world", "line;") == " /* 'hello, world' */"
45-
assert self.emitter.object_annotation(list(range(30)), "line;") == """\
46-
/* [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
47-
23, 24, 25, 26, 27, 28, 29] */"""
45+
assert self.emitter.object_annotation(42, "line;") == " /* 42 */"
46+
assert self.emitter.object_annotation((1, "x", None), "line;") == " /* (1, 'x', None) */"
47+
# Annotations containing illegal C comment characters are dropped.
48+
assert self.emitter.object_annotation("a /* b */ c", "line;") == ""
49+
50+
def test_object_annotation_frozenset_is_deterministic(self) -> None:
51+
assert (
52+
self.emitter.object_annotation(frozenset({"self", "cls"}), "line;")
53+
== self.emitter.object_annotation(frozenset({"cls", "self"}), "line;")
54+
== " /* frozenset({'cls', 'self'}) */"
55+
)
56+
assert (
57+
self.emitter.object_annotation((frozenset({"b", "a"}),), "line;")
58+
== self.emitter.object_annotation((frozenset({"a", "b"}),), "line;")
59+
== " /* (frozenset({'a', 'b'}),) */"
60+
)
61+
assert self.emitter.object_annotation(frozenset(), "line;") == " /* frozenset() */"
4862

4963
def test_emit_line(self) -> None:
5064
emitter = self.emitter
@@ -55,11 +69,9 @@ def test_emit_line(self) -> None:
5569
assert emitter.fragments == ["line;\n", "a {\n", " f();\n", "}\n"]
5670
emitter = Emitter(self.context, {})
5771
emitter.emit_line("CPyStatics[0];", ann="hello, world")
58-
emitter.emit_line("CPyStatics[1];", ann=list(range(30)))
72+
emitter.emit_line("CPyStatics[1];", ann=42)
5973
assert emitter.fragments[0] == "CPyStatics[0]; /* 'hello, world' */\n"
60-
assert emitter.fragments[1] == """\
61-
CPyStatics[1]; /* [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
62-
21, 22, 23, 24, 25, 26, 27, 28, 29] */\n"""
74+
assert emitter.fragments[1] == "CPyStatics[1]; /* 42 */\n"
6375

6476
def test_emit_undefined_value_for_simple_type(self) -> None:
6577
emitter = self.emitter

mypyc/test/test_literals.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
_encode_int_values,
1111
_encode_str_values,
1212
format_str_literal,
13+
literal_sort_key,
1314
)
1415

1516

@@ -88,3 +89,50 @@ def test_tuple_literal(self) -> None:
8889
"7", # Second tuple (length=4)
8990
"0", # Third tuple (length=0)
9091
]
92+
93+
def test_frozenset_literal_index_is_deterministic(self) -> None:
94+
# Index assignment for members must not depend on frozenset iteration
95+
# order (which is hash-seed dependent), so that generated code is
96+
# reproducible.
97+
lit1 = Literals()
98+
lit1.record_literal(frozenset({"self", "cls"}))
99+
lit2 = Literals()
100+
lit2.record_literal(frozenset({"cls", "self"}))
101+
for s in ("self", "cls"):
102+
assert lit1.literal_index(s) == lit2.literal_index(s)
103+
# Members are recorded in sorted order.
104+
assert lit1.literal_index("cls") == 3
105+
assert lit1.literal_index("self") == 4
106+
107+
def test_frozenset_encoding_is_deterministic(self) -> None:
108+
lit1 = Literals()
109+
lit1.record_literal(frozenset({"self", "cls"}))
110+
lit2 = Literals()
111+
lit2.record_literal(frozenset({"cls", "self"}))
112+
assert lit1.encoded_frozenset_values() == lit2.encoded_frozenset_values()
113+
114+
def test_literal_sort_key_is_total_over_types(self) -> None:
115+
# Heterogeneous, individually unorderable items must still be sorted.
116+
values = ["x", b"y", 1, None, (1, 2), frozenset({1, 2})]
117+
values_reversed = list(reversed(values))
118+
assert sorted(values, key=literal_sort_key) == sorted(
119+
values_reversed, key=literal_sort_key
120+
)
121+
122+
def test_literal_sort_key_with_frozenset(self) -> None:
123+
assert literal_sort_key(frozenset({"a", "b"})) == literal_sort_key(frozenset({"b", "a"}))
124+
assert literal_sort_key((frozenset({"a", "b"}),)) == literal_sort_key(
125+
(frozenset({"b", "a"}),)
126+
)
127+
assert literal_sort_key(frozenset({"a", frozenset({"b", "c"})})) == literal_sort_key(
128+
frozenset({frozenset({"c", "b"}), "a"})
129+
)
130+
131+
def test_nested_frozenset_literal_index_is_deterministic(self) -> None:
132+
lit1 = Literals()
133+
lit1.record_literal(frozenset({frozenset({"a", "b"}), frozenset({"c", "d"})}))
134+
lit2 = Literals()
135+
lit2.record_literal(frozenset({frozenset({"d", "c"}), frozenset({"b", "a"})}))
136+
for s in ("a", "b", "c", "d"):
137+
assert lit1.literal_index(s) == lit2.literal_index(s)
138+
assert lit1.encoded_frozenset_values() == lit2.encoded_frozenset_values()

0 commit comments

Comments
 (0)