From 1f05f2f79e10be2dfe403a7495ef2988456d2c99 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 27 Feb 2026 14:49:06 +0900 Subject: [PATCH 1/4] [RFC] Add grapheme_limit_codepoints function Unicode places no limit on the number of code points in a single grapheme cluster, but a cluster made up of very many code points can cause a crash. This adds a function that checks the number of code points in each grapheme cluster against a limit. --- ext/intl/grapheme/grapheme.h | 1 + ext/intl/grapheme/grapheme_string.cpp | 64 +++++++++++++++++++ ext/intl/php_intl.stub.php | 7 ++ ext/intl/php_intl_arginfo.h | 10 ++- ext/intl/tests/grapheme_limit_codepoints.phpt | 20 ++++++ 5 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 ext/intl/tests/grapheme_limit_codepoints.phpt diff --git a/ext/intl/grapheme/grapheme.h b/ext/intl/grapheme/grapheme.h index 8ec470d479272..aa2f167c9cf15 100644 --- a/ext/intl/grapheme/grapheme.h +++ b/ext/intl/grapheme/grapheme.h @@ -31,5 +31,6 @@ void grapheme_close_global_iterator( void ); #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2 #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS +#define GRAPHEME_LIMIT_CODEPOINTS 32 #endif // GRAPHEME_GRAPHEME_H diff --git a/ext/intl/grapheme/grapheme_string.cpp b/ext/intl/grapheme/grapheme_string.cpp index 6dd5a002a65b8..de34b77d809b0 100644 --- a/ext/intl/grapheme/grapheme_string.cpp +++ b/ext/intl/grapheme/grapheme_string.cpp @@ -1135,4 +1135,68 @@ U_CFUNC PHP_FUNCTION(grapheme_levenshtein) efree(ustring1); } +U_CFUNC PHP_FUNCTION(grapheme_limit_codepoints) +{ + char *string; + size_t string_len = 0; + zend_long limit_codepoint = GRAPHEME_LIMIT_CODEPOINTS; + int ustring_len = 0; + UErrorCode status; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + + ZEND_PARSE_PARAMETERS_START(1, 2) + Z_PARAM_STRING(string, string_len) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(limit_codepoint) + ZEND_PARSE_PARAMETERS_END(); + + status = U_ZERO_ERROR; + UBreakIterator *bi; + UText ut = 
UTEXT_INITIALIZER; + bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status ); + + if( U_FAILURE(status) ) { + intl_error_set_code( nullptr, status ); + + /* Set error messages. */ + intl_error_set_custom_msg( nullptr, "Error in grapheme_get_break_iterator" ); + RETURN_FALSE; + } + + utext_openUTF8(&ut, string, string_len, &status); + ubrk_setUText(bi, &ut, &status); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( nullptr, status ); + + /* Set error messages. */ + intl_error_set_custom_msg( nullptr, "Error opening UTF-8 text"); + + RETURN_FALSE; + } + + zend_ulong pos, before; + zend_bool ret = true; + for (before = pos = 0; pos != UBRK_DONE; ) { + pos = ubrk_next(bi); + if (pos != UBRK_DONE) { + for (zend_ulong i = before; i < (pos - before); i++) { + U8_FWD_1(string, before, (pos - before) - i); + if (i >= limit_codepoint) { + ret = false; + goto bi_close; + } + } + } + before = pos; + } + +bi_close: + ubrk_close(bi); + utext_close(&ut); + + RETURN_BOOL(ret); + +} /* }}} */ diff --git a/ext/intl/php_intl.stub.php b/ext/intl/php_intl.stub.php index 9a8f036865cd5..42bc6fded028a 100644 --- a/ext/intl/php_intl.stub.php +++ b/ext/intl/php_intl.stub.php @@ -166,6 +166,11 @@ * @cvalue UIDNA_ERROR_CONTEXTJ */ const IDNA_ERROR_CONTEXTJ = UNKNOWN; +/** + * @var int + * @cvalue GRAPHEME_LIMIT_CODEPOINTS + */ +const GRAPHEME_LIMIT_CODEPOINTS = UNKNOWN; class IntlException extends Exception { @@ -445,6 +450,8 @@ function grapheme_str_split(string $string, int $length = 1): array|false {} function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, string $locale = ""): int|false {} +function grapheme_limit_codepoints(string $string, int $limit = GRAPHEME_LIMIT_CODEPOINTS): bool {} + /** @param int $next */ function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} 
diff --git a/ext/intl/php_intl_arginfo.h b/ext/intl/php_intl_arginfo.h index e00e51420d46e..0365ed03d459d 100644 --- a/ext/intl/php_intl_arginfo.h +++ b/ext/intl/php_intl_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit php_intl.stub.php instead. - * Stub hash: d9e331c3a1ae46f8eae07ef0d39cb9990e74a0d1 */ + * Stub hash: 947017de0d17a87d9f1d325df76edaa7e2ed614b */ ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_ARG_OBJ_TYPE_MASK(0, timezone, IntlTimeZone|DateTimeZone, MAY_BE_STRING|MAY_BE_NULL, "null") @@ -501,6 +501,11 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_ ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, locale, IS_STRING, 0, "\"\"") ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_grapheme_limit_codepoints, 0, 1, _IS_BOOL, 0) + ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, limit, IS_LONG, 0, "GRAPHEME_LIMIT_CODEPOINTS") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) @@ -922,6 +927,7 @@ ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_str_split); ZEND_FUNCTION(grapheme_levenshtein); +ZEND_FUNCTION(grapheme_limit_codepoints); ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_utf8); @@ -1113,6 +1119,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split) ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein) + ZEND_FE(grapheme_limit_codepoints, arginfo_grapheme_limit_codepoints) ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8) @@ -1224,6 +1231,7 @@ static void 
register_php_intl_symbols(int module_number) REGISTER_LONG_CONSTANT("IDNA_ERROR_INVALID_ACE_LABEL", UIDNA_ERROR_INVALID_ACE_LABEL, CONST_PERSISTENT); REGISTER_LONG_CONSTANT("IDNA_ERROR_BIDI", UIDNA_ERROR_BIDI, CONST_PERSISTENT); REGISTER_LONG_CONSTANT("IDNA_ERROR_CONTEXTJ", UIDNA_ERROR_CONTEXTJ, CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("GRAPHEME_LIMIT_CODEPOINTS", GRAPHEME_LIMIT_CODEPOINTS, CONST_PERSISTENT); zend_attribute *attribute_Deprecated_func_intlcal_set_0 = zend_add_function_attribute(zend_hash_str_find_ptr(CG(function_table), "intlcal_set", sizeof("intlcal_set") - 1), ZSTR_KNOWN(ZEND_STR_DEPRECATED_CAPITALIZED), 2); diff --git a/ext/intl/tests/grapheme_limit_codepoints.phpt b/ext/intl/tests/grapheme_limit_codepoints.phpt new file mode 100644 index 0000000000000..42b8a165f46f6 --- /dev/null +++ b/ext/intl/tests/grapheme_limit_codepoints.phpt @@ -0,0 +1,20 @@ +--TEST-- +grapheme_limit_codepoints() function test +--EXTENSIONS-- +intl +--FILE-- + +--EXPECT-- +bool(false) +bool(true) +bool(true) +bool(true) From 03ed4acd73a334421d31efbb4ad344c83b92a78b Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 27 Feb 2026 15:05:24 +0900 Subject: [PATCH 2/4] Add grapheme_strlen for tests --- ext/intl/tests/grapheme_limit_codepoints.phpt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ext/intl/tests/grapheme_limit_codepoints.phpt b/ext/intl/tests/grapheme_limit_codepoints.phpt index 42b8a165f46f6..0647b91fed0e3 100644 --- a/ext/intl/tests/grapheme_limit_codepoints.phpt +++ b/ext/intl/tests/grapheme_limit_codepoints.phpt @@ -6,15 +6,23 @@ intl --EXPECT-- bool(false) +int(6) bool(true) +int(6) bool(true) +int(15) bool(true) +int(1) From ccbfb47bb9dd242ec4b8a2f78b01e534f52f53b7 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 27 Feb 2026 15:06:36 +0900 Subject: [PATCH 3/4] Remove invalid grapheme cluster that many code points --- ext/intl/tests/grapheme_limit_codepoints.phpt | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/ext/intl/tests/grapheme_limit_codepoints.phpt b/ext/intl/tests/grapheme_limit_codepoints.phpt index 0647b91fed0e3..5d496a6c9ac90 100644 --- a/ext/intl/tests/grapheme_limit_codepoints.phpt +++ b/ext/intl/tests/grapheme_limit_codepoints.phpt @@ -6,7 +6,6 @@ intl --EXPECT-- bool(false) -int(6) bool(true) int(6) bool(true)
From 2683c5bb4b395d43070a7b2597c95fffb0656b5b Mon Sep 17 00:00:00 2001
From: Yuya Hamada
Date: Fri, 27 Feb 2026 16:19:49 +0900
Subject: [PATCH 4/4] Fix unsequenced

Count the code points of each grapheme cluster with a dedicated offset
that U8_FWD_1 advances from the start of the cluster to its end.

The previous loop started its byte counter at the absolute offset of the
cluster while bounding it by the cluster's byte length, so clusters
after the first could be skipped entirely. It also stepped once per
byte rather than once per code point, which counted bytes instead of
code points and let U8_FWD_1 advance past the end of the cluster.

---
 ext/intl/grapheme/grapheme_string.cpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/ext/intl/grapheme/grapheme_string.cpp b/ext/intl/grapheme/grapheme_string.cpp
index de34b77d809b0..49956a7e5a3f0 100644
--- a/ext/intl/grapheme/grapheme_string.cpp
+++ b/ext/intl/grapheme/grapheme_string.cpp
@@ -1176,21 +1176,22 @@ U_CFUNC PHP_FUNCTION(grapheme_limit_codepoints)
 		RETURN_FALSE;
 	}
 
-	zend_ulong pos, before;
+	/* Byte offsets delimiting the current grapheme cluster: [start, end). */
+	int32_t start = 0, end;
 	zend_bool ret = true;
-	for (before = pos = 0; pos != UBRK_DONE; ) {
-		pos = ubrk_next(bi);
-		if (pos != UBRK_DONE) {
-			for (zend_ulong i = before; i < (pos - before); i++) {
-				U8_FWD_1(string, before, (pos - before) - i);
-				if (i >= limit_codepoint) {
-					ret = false;
-					goto bi_close;
-				}
-			}
-		}
-		before = pos;
-	}
+	while ((end = ubrk_next(bi)) != UBRK_DONE) {
+		zend_long codepoints = 0;
+
+		/* U8_FWD_1 steps offset over one code point, bounded by end. */
+		for (int32_t offset = start; offset < end; ) {
+			U8_FWD_1(string, offset, end);
+			if (++codepoints > limit_codepoint) {
+				ret = false;
+				goto bi_close;
+			}
+		}
+		start = end;
+	}
 
 bi_close:
 	ubrk_close(bi);