diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 94e3b69abf243..af9ae60936d8e 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -42,7 +42,8 @@ def read_f64_le(b: bytes, index: i64, /) -> float: ... def read_f64_be(b: bytes, index: i64, /) -> float: ... # Codepoint classification helpers operating on i32 codepoints (typically -# obtained via ord(s[i])). Negative inputs return False. +# obtained via ord(s[i])). Out-of-range inputs (negative, or past the maximum +# Unicode code point 0x10FFFF) return False. def isspace(c: i32, /) -> bool: ... def isdigit(c: i32, /) -> bool: ... def isalnum(c: i32, /) -> bool: ... @@ -53,7 +54,8 @@ def isidentifier(c: i32, /) -> bool: ... # uppercase / lowercase expands to multiple codepoints (e.g. U+00DF # uppercases to "SS", U+FB01 to "FI"), returns the input unchanged so # the signature stays i32 -> i32. Use str.upper() / str.lower() for full -# Unicode case conversion when those cases matter. Negative inputs are -# returned unchanged. +# Unicode case conversion when those cases matter. Out-of-range inputs +# (negative, or past the maximum Unicode code point 0x10FFFF) are returned +# unchanged. def toupper(c: i32, /) -> i32: ... def tolower(c: i32, /) -> i32: ... diff --git a/mypyc/build.py b/mypyc/build.py index 13bd50fef3b1a..57438c7d5f52b 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -54,7 +54,12 @@ class ModDesc(NamedTuple): LIBRT_MODULES = [ ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]), - ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]), + ModDesc( + "librt.strings", + ["strings/librt_strings.c"], + ["strings/librt_strings.h", "strings/librt_strings_common.h"], + ["strings"], + ), ModDesc( "librt.base64", [ diff --git a/mypyc/lib-rt/strings/librt_strings.h b/mypyc/lib-rt/strings/librt_strings.h index 6c1942667ba44..33af57dc2a2a8 100644 --- a/mypyc/lib-rt/strings/librt_strings.h +++ b/mypyc/lib-rt/strings/librt_strings.h @@ -31,8 +31,9 @@ typedef struct { } StringWriterObject; // Codepoint classification helpers. Inputs are signed i32 for compatibility -// with mypyc's int32_rprimitive; negative values are non-codepoints and -// return false. Defined `static inline` so they compile statically into +// with mypyc's int32_rprimitive; out-of-range values (negative, or past the +// maximum Unicode code point 0x10FFFF) are non-codepoints and return false. +// Defined `static inline` so they compile statically into // both the librt.strings module and any mypyc-compiled extension that // includes this header, avoiding the capsule indirection that would dwarf // the work of a single Py_UNICODE_IS* macro call. @@ -58,12 +59,14 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { // PyUnicode_IsIdentifier on a 1-character string. Aborts via // CPyError_OutOfMemory on allocation failure to keep this ERR_NEVER. static inline bool LibRTStrings_IsIdentifier(int32_t c) { - if (c < 0) return false; - if (c < 128) { + // Unsigned compare: negatives wrap to large values and skip the fast path. + if ((uint32_t)c < 128) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; } + // Reject negatives and code points past the Unicode maximum. + if ((uint32_t)c > 0x10FFFF) return false; PyObject *s = PyUnicode_FromOrdinal((int)c); if (s == NULL) { CPyError_OutOfMemory(); @@ -101,9 +104,13 @@ static inline int32_t LibRTStrings_ChangeCase_slow(int32_t c, const char *method // non-ASCII delegates to str.upper on a 1-character string. Returns the // input unchanged when uppercasing expands to multiple codepoints. static inline int32_t LibRTStrings_ToUpper(int32_t c) { - if (c < 0) return c; - if (c >= 'a' && c <= 'z') return c - 32; - if (c < 128) return c; + // Unsigned compare: negatives wrap to large values and skip the fast path. + if ((uint32_t)c < 128) { + if (c >= 'a' && c <= 'z') return c - 32; + return c; + } + // Negatives and code points past the Unicode maximum are returned unchanged. + if ((uint32_t)c > 0x10FFFF) return c; return LibRTStrings_ChangeCase_slow(c, "upper"); } @@ -111,9 +118,13 @@ static inline int32_t LibRTStrings_ToUpper(int32_t c) { // non-ASCII delegates to str.lower on a 1-character string. Returns the // input unchanged when lowercasing expands to multiple codepoints. static inline int32_t LibRTStrings_ToLower(int32_t c) { - if (c < 0) return c; - if (c >= 'A' && c <= 'Z') return c + 32; - if (c < 128) return c; + // Unsigned compare: negatives wrap to large values and skip the fast path. + if ((uint32_t)c < 128) { + if (c >= 'A' && c <= 'Z') return c + 32; + return c; + } + // Negatives and code points past the Unicode maximum are returned unchanged. + if ((uint32_t)c > 0x10FFFF) return c; return LibRTStrings_ChangeCase_slow(c, "lower"); } diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 7efff12667d87..333b5962c4c99 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1449,8 +1449,9 @@ from testutil import assertRaises def test_codepoint_classifiers() -> None: - # Negative values are not codepoints. - for bad in (i32(-1), i32(-113)): + # Out-of-range values are not codepoints: negative, just past the maximum + # valid code point (0x10FFFF), and the largest i32. + for bad in (i32(-1), i32(-113), i32(0x110000), i32(0x7FFFFFFF)): assert not isspace(bad) assert not isdigit(bad) assert not isalnum(bad) @@ -1485,6 +1486,10 @@ def test_codepoint_classifiers_via_any() -> None: assert f(ord(false_input)) is False # Negative values are valid i32, just not codepoints. assert f(-1) is False + # Values within i32 range but past the maximum code point (0x10FFFF) + # are not codepoints either. + assert f(0x110000) is False + assert f(0x7FFFFFFF) is False # Inputs outside i32 range raise OverflowError through the wrapper. with assertRaises(OverflowError, "codepoint out of i32 range"): f(1 << 40) @@ -1509,8 +1514,9 @@ def _expect(c: str, method: str) -> int: def test_codepoint_case_conversion() -> None: - # Negative inputs return unchanged. - for bad in (i32(-1), i32(-113)): + # Out-of-range inputs return unchanged: negative, just past the maximum + # valid code point (0x10FFFF), and the largest i32. + for bad in (i32(-1), i32(-113), i32(0x110000), i32(0x7FFFFFFF)): assert toupper(bad) == bad assert tolower(bad) == bad # Agree with str.upper / str.lower across the full Unicode range @@ -1534,6 +1540,10 @@ def test_codepoint_case_conversion_via_any() -> None: assert f(in_cp) == out_cp # Negative values are valid i32, returned unchanged. assert f(-1) == -1 + # Values within i32 range but past the maximum code point (0x10FFFF) + # are returned unchanged. + assert f(0x110000) == 0x110000 + assert f(0x7FFFFFFF) == 0x7FFFFFFF # Inputs outside i32 range raise OverflowError through the wrapper. with assertRaises(OverflowError, "codepoint out of i32 range"): f(1 << 40)