Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
01b4486
fix preg_replace_callback function
Shamzik Dec 10, 2025
a38a0e2
replace set_matches with template<bool is_offset_capture, bool is_unm…
Shamzik Dec 12, 2025
d797a2a
fixes
Shamzik Dec 12, 2025
c9afc9f
details namespace
Shamzik Dec 12, 2025
39518a5
revert template
Shamzik Dec 17, 2025
98d44d2
format
Shamzik Dec 17, 2025
4305ae1
move all into details namespace
Shamzik Dec 19, 2025
519a553
stash refactoring
Shamzik Dec 25, 2025
bbde33b
finalize refactoring
Shamzik Dec 29, 2025
e5bce5a
namespaces
Shamzik Dec 25, 2025
6c1017b
move valid_preg_replace_mixed into details namespace
Shamzik Dec 25, 2025
7b54150
inline reserve_buffer
Shamzik Dec 29, 2025
a2a222c
kphp namespace
Shamzik Dec 25, 2025
ddc9bd8
fix valid_regex_flags
Shamzik Dec 29, 2025
a49d222
add test
Shamzik Dec 29, 2025
1b8bc30
rc -> ret_code
Shamzik Dec 29, 2025
56ea4e4
format
Shamzik Dec 29, 2025
b7c8ac8
fixes
Shamzik Jan 12, 2026
cb78255
types
Shamzik Jan 12, 2026
cdfa1bd
string replacement
Shamzik Jan 12, 2026
84b643a
fixes
Shamzik Jan 12, 2026
e95b37f
fix count in preg_replace_callback
Shamzik Jan 12, 2026
a03504d
Merge branch 'master' into kshamazov/pcre2_functions
Shamzik Jan 16, 2026
7b89ef1
make compiled_regex to be public
Shamzik Jan 19, 2026
9b73a7e
include fix
Shamzik Jan 19, 2026
2be8393
rename usings
Shamzik Jan 19, 2026
26be506
fix
Shamzik Jan 19, 2026
1a69881
make group_name_iterator to be safe
Shamzik Jan 19, 2026
97e6c7e
index bytes
Shamzik Jan 19, 2026
a9b0e2c
make m_entry_size to be const
Shamzik Jan 19, 2026
54f16e0
name -> group_names
Shamzik Jan 19, 2026
00d8e0e
collect_group_names refactoring
Shamzik Jan 19, 2026
a5710fa
remove const from raw pcre2 types
Shamzik Jan 21, 2026
1fd041f
mv RegexInstanceState definition into header
Shamzik Jan 21, 2026
35a674d
add comments
Shamzik Jan 21, 2026
ecfb15d
make replacement to be optional
Shamzik Jan 21, 2026
d65e21d
merge assrtions
Shamzik Jan 21, 2026
621f35a
details::offset_pair constants
Shamzik Jan 21, 2026
81c35d0
substitute refactoring
Shamzik Jan 21, 2026
1adcbad
check buffer.data()
Shamzik Jan 21, 2026
1ec2278
remove default argument value for get_range
Shamzik Jan 21, 2026
3eecfe2
rename m_base_options to m_user_options
Shamzik Jan 21, 2026
d7e4d0b
format
Shamzik Jan 21, 2026
424aba2
std::formatter
Shamzik Jan 21, 2026
4eb1f63
START & END
Shamzik Jan 30, 2026
682e4ee
UPPER, LOWER & ERROR_BUFFER_LENGTH
Shamzik Jan 30, 2026
30cf521
check regex_pcre2_match_data on init
Shamzik Jan 30, 2026
c9234af
rename state fields
Shamzik Jan 30, 2026
84a0536
inline instead of static
Shamzik Jan 30, 2026
93f8361
format
Shamzik Jan 30, 2026
490857e
Merge branch 'master' into kshamazov/pcre2_functions
Shamzik Jan 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion runtime-light/coroutine/await-set.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class await_set {
}

auto next() noexcept {
return detail::await_set::await_set_awaitable<return_type>{*m_await_broker.get()};
return detail::await_set::await_set_awaitable<return_type>{*m_await_broker};
}

bool empty() const noexcept {
Expand Down
372 changes: 372 additions & 0 deletions runtime-light/stdlib/string/pcre2-functions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,372 @@
// Compiler for PHP (aka KPHP)
// Copyright (c) 2025 LLC «V Kontakte»
// Distributed under the GPL v3 License, see LICENSE.notice.txt

#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <format>
#include <iterator>
#include <memory>
#include <optional>
#include <span>
#include <string_view>
#include <utility>

#include "runtime-light/stdlib/diagnostics/logs.h"
Comment thread
apolyakov marked this conversation as resolved.
// correctly include PCRE2 lib
#include "runtime-light/stdlib/string/regex-include.h"

namespace kphp::pcre2 {

namespace details {

// Describes how one (start, end) offset pair is laid out inside the PCRE2 ovector.
// Pair #i is addressed as `ovector[SIZE * i + START]` / `ovector[SIZE * i + END]`.
namespace offset_pair {

// position of the start offset inside a pair
inline constexpr size_t START{0};
// position of the end offset inside a pair
inline constexpr size_t END{1};
// number of offsets that make up one pair (stride between consecutive pairs)
inline constexpr size_t SIZE{2};

} // namespace offset_pair

// Moves `offset` forward while it points at a UTF-8 continuation byte of `subject`.
//
// Continuation bytes are the non-leading bytes of a multibyte rune; their two most significant
// bits are always `10`. The returned offset is either the first position at or after `offset`
// that does not hold such a byte, or an offset that is >= subject.size() (in which case the
// input offset is returned unchanged if it was already out of range).
inline int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept {
  // mask that keeps only the two most significant bits of a byte
  constexpr unsigned char TOP_TWO_BITS_MASK{0xc0};
  // value of those two bits (0b10xxxxxx) for a continuation byte
  constexpr unsigned char CONTINUATION_PATTERN{0x80};

  for (; offset < subject.size(); ++offset) {
    const auto current_byte{static_cast<unsigned char>(subject[offset])};
    if ((current_byte & TOP_TWO_BITS_MASK) != CONTINUATION_PATTERN) {
      break;
    }
  }
  return static_cast<int64_t>(offset);
}

} // namespace details

// Owning handles for the 8-bit PCRE2 objects.
// Each alias pairs a raw PCRE2 type with a function-pointer deleter of the matching `pcre2_*_free_8` signature,
// so the concrete free function has to be passed explicitly whenever a handle is constructed.
using general_context = std::unique_ptr<pcre2_general_context_8, decltype(std::addressof(pcre2_general_context_free_8))>;
using compile_context = std::unique_ptr<pcre2_compile_context_8, decltype(std::addressof(pcre2_compile_context_free_8))>;
using match_context = std::unique_ptr<pcre2_match_context_8, decltype(std::addressof(pcre2_match_context_free_8))>;
using match_data = std::unique_ptr<pcre2_match_data_8, decltype(std::addressof(pcre2_match_data_free_8))>;
using code = std::unique_ptr<pcre2_code_8, decltype(std::addressof(pcre2_code_free_8))>;

// Generic PCRE2 failure: carries the raw numeric code reported by a PCRE2 call
// (the same value that is later handed to pcre2_get_error_message_8 for formatting).
struct error {
int32_t code{};
};

// Failure of pattern compilation: the PCRE2 error code plus the error offset reported by pcre2_compile_8.
struct compile_error : kphp::pcre2::error {
// error offset as written by pcre2_compile_8 into its `erroroffset` output argument
size_t offset{};
};

// One entry of a pattern's name table: a named capture group and its number.
struct group_name {
// view of the group name; it points into the name table of the compiled pattern and does not own the characters
std::string_view name;
// group number decoded from the first two bytes of the name table entry
size_t index{};
};

/**
 * Owning handle to a compiled PCRE2 pattern.
 *
 * Instances are created only through regex::compile(). The remaining members are read-only
 * queries answered by pcre2_pattern_info_8(); every such query is expected to succeed and is
 * guarded by an assertion.
 */
class regex {
  kphp::pcre2::code m_code;

  /**
   * Iterator over the entries of the pattern's name table.
   *
   * Entries are `m_entry_size` code units apart. The first two code units of an entry hold the
   * group number (most significant byte first); the group name follows and is read as a
   * NUL-terminated string.
   */
  class group_name_iterator {
    const PCRE2_UCHAR8* m_ptr{nullptr};
    const size_t m_entry_size{};

  public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = kphp::pcre2::group_name;
    using difference_type = std::ptrdiff_t;
    using pointer = kphp::pcre2::group_name*;
    using reference = kphp::pcre2::group_name;

    group_name_iterator() = delete;
    group_name_iterator(const PCRE2_UCHAR8* current_entry, size_t entry_size) noexcept
        : m_ptr{current_entry},
          m_entry_size{entry_size} {
      // A null entry pointer is legal only for the (nullptr, 0) sentinel pair that group_names()
      // uses to describe an empty range. The previous unconditional `!= nullptr` check rejected that sentinel as well.
      kphp::log::assertion(current_entry != nullptr || entry_size == 0);
    }

    // Decodes the entry the iterator currently points at. Must not be called on the empty-range sentinel.
    kphp::pcre2::group_name operator*() const noexcept {
      static constexpr size_t UPPER = 0;
      static constexpr size_t LOWER = 1;

      kphp::log::assertion(m_ptr != nullptr);

      const auto index{static_cast<size_t>(m_ptr[UPPER] << 8 | m_ptr[LOWER])};
      const auto* name_ptr{reinterpret_cast<const char*>(std::next(m_ptr, 2 * sizeof(PCRE2_UCHAR8)))};
      return {.name = std::string_view{name_ptr}, .index = index};
    }

    // Steps to the next table entry.
    group_name_iterator& operator++() noexcept {
      std::advance(m_ptr, m_entry_size);
      return *this;
    }

    group_name_iterator operator++(int) noexcept { // NOLINT
      group_name_iterator tmp{*this};
      ++*this;
      return tmp;
    }

    // Two iterators are equal when they point at the same table position.
    bool operator==(const group_name_iterator& other) const noexcept {
      return m_ptr == other.m_ptr;
    }
  };

public:
  friend class match_view;
  friend class matcher;

  /**
   * Compiles `pattern` with the given PCRE2 compile `options`.
   *
   * @param pattern pattern text; it is passed with an explicit length, so it does not have to be NUL-terminated
   * @param ctx     compile context forwarded to pcre2_compile_8
   * @param options PCRE2 compile options
   * @return the compiled regex, or a compile_error holding the PCRE2 error code and error offset
   */
  static std::expected<regex, kphp::pcre2::compile_error> compile(std::string_view pattern, kphp::pcre2::compile_context& ctx, uint32_t options = 0) noexcept {
    int32_t errorcode{};
    PCRE2_SIZE erroroffset{};

    kphp::pcre2::code re{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.length(), options, std::addressof(errorcode),
                                         std::addressof(erroroffset), ctx.get()),
                         pcre2_code_free_8};

    if (!re) {
      return std::unexpected{kphp::pcre2::compile_error{{.code = errorcode}, erroroffset}};
    }
    return kphp::pcre2::regex{std::move(re)};
  }

  // Half-open [b, e) range of name table entries; usable in a range-based for loop.
  struct group_name_range {
    group_name_iterator b;
    group_name_iterator e;

    group_name_iterator begin() const noexcept {
      return b;
    }
    group_name_iterator end() const noexcept {
      return e;
    }

    bool empty() const noexcept {
      return b == e;
    }
  };

  /**
   * Returns the range of named groups of the pattern.
   *
   * For a pattern without named groups an empty range is returned without touching the name table.
   * The yielded names are views into the compiled pattern, so they must not outlive this regex.
   */
  group_name_range group_names() const noexcept {
    const uint32_t count{name_count()};
    if (count == 0) {
      return {.b = group_name_iterator{nullptr, 0}, .e = group_name_iterator{nullptr, 0}};
    }

    uint32_t entry_size{};
    PCRE2_SPTR8 table{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size)) == 0 &&
                         pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMETABLE, std::addressof(table)) == 0);

    return {.b = group_name_iterator{table, entry_size}, .e = group_name_iterator{std::next(table, static_cast<size_t>(count) * entry_size), entry_size}};
  }

  // Value of PCRE2_INFO_CAPTURECOUNT for the pattern.
  uint32_t capture_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_CAPTURECOUNT, std::addressof(count)) == 0);
    return count;
  }

  // Value of PCRE2_INFO_NAMECOUNT for the pattern, i.e. the number of name table entries.
  uint32_t name_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0);
    return count;
  }

  /**
   * Tells whether the pattern is processed in UTF mode.
   *
   * PCRE2_INFO_ALLOPTIONS is queried rather than PCRE2_INFO_ARGOPTIONS: per the PCRE2 documentation the former
   * also reflects options enabled from inside the pattern (e.g. a leading `(*UTF)`), while the latter only
   * echoes the options passed to pcre2_compile().
   */
  bool is_utf() const noexcept {
    uint32_t effective_options{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_ALLOPTIONS, std::addressof(effective_options)) == 0);
    return (effective_options & PCRE2_UTF) != 0;
  }

private:
  explicit regex(kphp::pcre2::code&& code) noexcept
      : m_code{std::move(code)} {}
};

/**
 * Non-owning view of one successful match.
 *
 * It only stores references to the regex and to the match data block plus a view of the subject,
 * so all of them have to stay alive and unchanged while the view is used.
 */
class match_view {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_data& m_match_data;
  // match options this view was created with; they are OR-ed into the options of substitute()
  uint32_t m_match_options{};
  // number of leading ovector pairs that may be inspected through get_group*()
  size_t m_num_groups{};

public:
  match_view(const regex& re, std::string_view subject, kphp::pcre2::match_data& match_data, uint32_t match_options, size_t num_groups) noexcept
      : m_re{re},
        m_subject{subject},
        m_match_data{match_data},
        m_match_options{match_options},
        m_num_groups{num_groups} {}

  // Number of groups accessible through this view (group 0 is the whole match).
  int32_t size() const noexcept {
    return static_cast<int32_t>(m_num_groups);
  }

  // Byte offsets [start, end) of a group inside the subject.
  struct offset_range {
    size_t start{};
    size_t end{};
  };

  // Text of group `i`, or std::nullopt when `i` is out of range or the group is unset.
  std::optional<std::string_view> get_group(size_t i) const noexcept {
    const auto bounds{get_range(i)};
    if (!bounds.has_value()) {
      return std::nullopt;
    }
    return m_subject.substr(bounds->start, bounds->end - bounds->start);
  }

  // Text of a group together with the offset in the subject where it starts.
  struct group_content {
    std::string_view text;
    size_t offset{};
  };

  // Same as get_group(), but additionally reports where the group starts in the subject.
  std::optional<group_content> get_group_content(size_t i) const noexcept {
    const auto bounds{get_range(i)};
    if (!bounds.has_value()) {
      return std::nullopt;
    }
    return group_content{.text = m_subject.substr(bounds->start, bounds->end - bounds->start), .offset = bounds->start};
  }

  // Start offset of the whole match (first element of the ovector).
  size_t match_start() const noexcept {
    const auto* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return ovector[kphp::pcre2::details::offset_pair::START];
  }
  // End offset of the whole match (second element of the ovector).
  size_t match_end() const noexcept {
    const auto* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return ovector[kphp::pcre2::details::offset_pair::END];
  }

  /**
   * Expands `replacement` for this match into `buffer` by calling pcre2_substitute_8 with
   * PCRE2_SUBSTITUTE_MATCHED and PCRE2_SUBSTITUTE_REPLACEMENT_ONLY (plus the stored match options).
   *
   * @param replacement replacement template
   * @param buffer      output storage; its data pointer must not be null
   * @param ctx         match context forwarded to PCRE2
   * @return on success, the length PCRE2 stored in the output-length argument; on failure, a pair of that
   *         length value and the error. NOTE(review): with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH set, the length
   *         reported on overflow is presumably the required buffer size - confirm against the PCRE2 docs.
   */
  std::expected<size_t, std::pair<size_t, kphp::pcre2::error>> substitute(std::string_view replacement, std::span<char> buffer,
                                                                          kphp::pcre2::match_context& ctx) const noexcept {
    kphp::log::assertion(buffer.data() != nullptr);

    const uint32_t options{m_match_options | PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_MATCHED |
                           PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_REPLACEMENT_ONLY};

    const auto* subject_ptr{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};
    const auto* replacement_ptr{reinterpret_cast<PCRE2_SPTR8>(replacement.data())};
    auto* output_ptr{reinterpret_cast<PCRE2_UCHAR8*>(buffer.data())};

    // in: capacity of the output buffer, out: length reported by PCRE2
    PCRE2_SIZE output_len{buffer.size()};
    const auto ret_code{pcre2_substitute_8(m_re.m_code.get(), subject_ptr, m_subject.length(), 0, options, m_match_data.get(), ctx.get(), replacement_ptr,
                                           replacement.length(), output_ptr, std::addressof(output_len))};

    if (ret_code >= 0) {
      return output_len;
    }
    return std::unexpected{std::pair<size_t, kphp::pcre2::error>{output_len, kphp::pcre2::error{.code = ret_code}}};
  }

private:
  // Offsets of group `i`; std::nullopt when `i` is not below the group count or the group's start is PCRE2_UNSET.
  std::optional<offset_range> get_range(size_t i) const noexcept {
    namespace pair_layout = kphp::pcre2::details::offset_pair;

    if (i >= m_num_groups) {
      return std::nullopt;
    }

    // ovector is an array of offset pairs; `pair_base` is the index of the first offset of pair #i
    const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
    const size_t pair_base{pair_layout::SIZE * i};

    const PCRE2_SIZE group_start{ovector[pair_base + pair_layout::START]};
    const PCRE2_SIZE group_end{ovector[pair_base + pair_layout::END]};

    if (group_start == PCRE2_UNSET) {
      return std::nullopt;
    }
    return offset_range{.start = group_start, .end = group_end};
  }
};

/**
 * Stateful helper that walks over successive matches of a regex in a subject.
 *
 * Each call to next() runs pcre2_match_8 from the current offset and, on success, moves the
 * offset to the end of the found match. The regex, match context and match data are held by
 * reference and must outlive the matcher.
 */
class matcher {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_context& m_ctx;
  // offset in the subject from which the next attempt starts
  PCRE2_SIZE m_current_offset{};
  kphp::pcre2::match_data& m_match_data;
  // options supplied by the caller; applied to every attempt
  uint32_t m_user_options{};
  // extra options for the next attempt; non-zero only right after an empty match
  uint32_t m_match_options{};
  bool m_is_utf{false};

public:
  matcher(const kphp::pcre2::regex& re, std::string_view subject, size_t match_from, kphp::pcre2::match_context& ctx, kphp::pcre2::match_data& data,
          uint32_t options = 0) noexcept
      : m_re{re},
        m_subject{subject},
        m_ctx{ctx},
        m_current_offset{match_from},
        m_match_data{data},
        m_user_options{options},
        m_is_utf{re.is_utf()} {}

  /**
   * Looks for the next match.
   *
   * @return a match_view for the found match; std::nullopt when there are no more matches;
   *         or an error holding the negative PCRE2 code for any failure other than "no match".
   */
  std::expected<std::optional<kphp::pcre2::match_view>, kphp::pcre2::error> next() noexcept {
    namespace pair_layout = kphp::pcre2::details::offset_pair;

    while (m_current_offset <= m_subject.length()) {
      const uint32_t attempt_options{m_user_options | m_match_options};
      const auto* subject_ptr{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};

      const auto ret_code{
          pcre2_match_8(m_re.m_code.get(), subject_ptr, m_subject.length(), m_current_offset, attempt_options, m_match_data.get(), m_ctx.get())};

      if (ret_code == PCRE2_ERROR_NOMATCH) {
        if (m_match_options == 0) {
          // a regular attempt found nothing: the subject is exhausted
          return std::nullopt;
        }
        // The anchored non-empty retry after an empty match failed:
        // drop the extra flags, step one unit forward (a whole rune in UTF mode) and try again
        m_match_options = 0;
        ++m_current_offset;
        if (m_is_utf) {
          m_current_offset = kphp::pcre2::details::skip_utf8_subsequent_bytes(m_current_offset, m_subject);
        }
        continue;
      }

      // From https://www.pcre.org/current/doc/html/pcre2_match.html
      // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set
      // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors.
      if (ret_code < 0) [[unlikely]] {
        return std::unexpected{error{.code = ret_code}};
      }

      // zero means the ovector was too small, so every pair it does have is reported
      const size_t groups_count{ret_code == 0 ? static_cast<size_t>(pcre2_get_ovector_count_8(m_match_data.get())) : static_cast<size_t>(ret_code)};

      const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
      const size_t match_begin{ovector[pair_layout::START]};
      const size_t match_finish{ovector[pair_layout::END]};

      // After an empty match the next attempt first tries to find a non-empty match at the same position
      const bool is_empty_match{match_begin == match_finish};
      m_match_options = is_empty_match ? (PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED) : 0U;
      m_current_offset = match_finish;

      return kphp::pcre2::match_view{m_re, m_subject, m_match_data, attempt_options, groups_count};
    }

    return std::nullopt;
  }
};

} // namespace kphp::pcre2

// Formatter that renders a kphp::pcre2::error as the message returned by pcre2_get_error_message_8.
// No format specification is parsed: parse() returns the beginning of the parse context as is.
template<>
struct std::formatter<kphp::pcre2::error> {
  template<typename ParseContext>
  constexpr auto parse(ParseContext& ctx) const noexcept {
    return ctx.begin();
  }

  // Writes the textual description of `error` to the output of `ctx`.
  template<typename FmtContext>
  auto format(kphp::pcre2::error error, FmtContext& ctx) const noexcept {
    static constexpr size_t ERROR_BUFFER_LENGTH{256};

    std::array<char, ERROR_BUFFER_LENGTH> message; // NOLINT
    const auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast<PCRE2_UCHAR8*>(message.data()), message.size())};

    if (ret_code < 0) [[unlikely]] {
      if (ret_code == PCRE2_ERROR_BADDATA) {
        // PCRE2 does not know this code: print the raw number instead of a message
        return format_to(ctx.out(), "unknown error ({})", error.code);
      }
      if (ret_code == PCRE2_ERROR_NOMEMORY) {
        // the message did not fit into the buffer: print what was stored and mark it as truncated
        return format_to(ctx.out(), "[truncated] {}", message.data());
      }
      // NOTE(review): presumably kphp::log::error does not return; otherwise execution continues below - confirm
      kphp::log::error("unsupported regex error code: {}", ret_code);
    }
    return format_to(ctx.out(), "{}", message.data());
  }
};
Loading
Loading