diff --git a/include/boost/corosio/native/detail/epoll/epoll_acceptor.hpp b/include/boost/corosio/native/detail/epoll/epoll_acceptor.hpp index 5e502aaf..b9d23c14 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_acceptor.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_acceptor.hpp @@ -14,13 +14,9 @@ #if BOOST_COROSIO_HAS_EPOLL -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { @@ -28,9 +24,12 @@ class epoll_acceptor_service; /// Acceptor implementation for epoll backend. class epoll_acceptor final - : public tcp_acceptor::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_acceptor< + epoll_acceptor, + epoll_acceptor_service, + epoll_op, + epoll_accept_op, + descriptor_state> { friend class epoll_acceptor_service; @@ -44,47 +43,8 @@ class epoll_acceptor final std::error_code*, io_object::implementation**) override; - int native_handle() const noexcept - { - return fd_; - } - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - bool is_open() const noexcept override - { - return fd_ >= 0; - } void cancel() noexcept override; - - std::error_code set_option( - int level, - int optname, - void const* data, - std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - void cancel_single_op(epoll_op& op) noexcept; void close_socket() noexcept; - void set_local_endpoint(endpoint ep) noexcept - { - local_endpoint_ = ep; - } - - epoll_acceptor_service& service() noexcept - { - return svc_; - } - - epoll_accept_op acc_; - descriptor_state desc_state_; - -private: - epoll_acceptor_service& svc_; - int fd_ = -1; - endpoint local_endpoint_; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/epoll/epoll_acceptor_service.hpp 
b/include/boost/corosio/native/detail/epoll/epoll_acceptor_service.hpp index 72e5df56..3c8276f6 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_acceptor_service.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_acceptor_service.hpp @@ -21,14 +21,12 @@ #include #include #include +#include -#include -#include -#include +#include #include #include -#include #include #include @@ -39,21 +37,9 @@ namespace boost::corosio::detail { -/** State for epoll acceptor service. */ -class epoll_acceptor_state -{ -public: - explicit epoll_acceptor_state(epoll_scheduler& sched) noexcept - : sched_(sched) - { - } - - epoll_scheduler& sched_; - std::mutex mutex_; - intrusive_list acceptor_list_; - std::unordered_map> - acceptor_ptrs_; -}; +/// State for epoll acceptor service. +using epoll_acceptor_state = + reactor_service_state; /** epoll acceptor service implementation. @@ -88,7 +74,7 @@ class BOOST_COROSIO_DECL epoll_acceptor_service final : public acceptor_service { return state_->sched_; } - void post(epoll_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -100,12 +86,6 @@ class BOOST_COROSIO_DECL epoll_acceptor_service final : public acceptor_service std::unique_ptr state_; }; -//-------------------------------------------------------------------------- -// -// Implementation -// -//-------------------------------------------------------------------------- - inline void epoll_accept_op::cancel() noexcept { @@ -118,79 +98,11 @@ epoll_accept_op::cancel() noexcept inline void epoll_accept_op::operator()() { - stop_cb.reset(); - - static_cast(acceptor_impl_) - ->service() - .scheduler() - .reset_inline_budget(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - - // Set up the peer socket on success - if (success && 
accepted_fd >= 0 && acceptor_impl_) - { - auto* socket_svc = static_cast(acceptor_impl_) - ->service() - .socket_service(); - if (socket_svc) - { - auto& impl = static_cast(*socket_svc->construct()); - impl.set_socket(accepted_fd); - - impl.desc_state_.fd = accepted_fd; - { - std::lock_guard lock(impl.desc_state_.mutex); - impl.desc_state_.read_op = nullptr; - impl.desc_state_.write_op = nullptr; - impl.desc_state_.connect_op = nullptr; - } - socket_svc->scheduler().register_descriptor( - accepted_fd, &impl.desc_state_); - - impl.set_endpoints( - static_cast(acceptor_impl_)->local_endpoint(), - from_sockaddr(peer_storage)); - - if (impl_out) - *impl_out = &impl; - accepted_fd = -1; - } - else - { - // No socket service — treat as error - *ec_out = make_err(ENOENT); - success = false; - } - } - - if (!success || !acceptor_impl_) - { - if (accepted_fd >= 0) - { - ::close(accepted_fd); - accepted_fd = -1; - } - if (impl_out) - *impl_out = nullptr; - } - - // Move to stack before resuming. See epoll_op::operator()() for rationale. 
- capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_accept_op(*this); } inline epoll_acceptor::epoll_acceptor(epoll_acceptor_service& svc) noexcept - : svc_(svc) + : reactor_acceptor(svc) { } @@ -311,71 +223,13 @@ epoll_acceptor::accept( inline void epoll_acceptor::cancel() noexcept { - cancel_single_op(acc_); -} - -inline void -epoll_acceptor::cancel_single_op(epoll_op& op) noexcept -{ - auto self = weak_from_this().lock(); - if (!self) - return; - - op.request_cancel(); - - epoll_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_op == &op) - claimed = std::exchange(desc_state_.read_op, nullptr); - } - if (claimed) - { - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } + do_cancel(); } inline void epoll_acceptor::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - acc_.request_cancel(); - - epoll_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - claimed = std::exchange(desc_state_.read_op, nullptr); - desc_state_.read_ready = false; - desc_state_.write_ready = false; - } - - if (claimed) - { - acc_.impl_ptr = self; - svc_.post(&acc_); - svc_.work_finished(); - } - - if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) - desc_state_.impl_ref_ = self; - } - - if (fd_ >= 0) - { - if (desc_state_.registered_events != 0) - svc_.scheduler().deregister_descriptor(fd_); - ::close(fd_); - fd_ = -1; - } - - desc_state_.fd = -1; - desc_state_.registered_events = 0; - - local_endpoint_ = endpoint{}; + do_close_socket(); } inline epoll_acceptor_service::epoll_acceptor_service( @@ -394,10 +248,10 @@ epoll_acceptor_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->acceptor_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) impl->close_socket(); - // Don't clear 
acceptor_ptrs_ here — same rationale as + // Don't clear impl_ptrs_ here — same rationale as // epoll_socket_service::shutdown(). Let ~state_ release ptrs // after scheduler shutdown has drained all queued ops. } @@ -409,8 +263,8 @@ epoll_acceptor_service::construct() auto* raw = impl.get(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.push_back(raw); - state_->acceptor_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); return raw; } @@ -421,8 +275,8 @@ epoll_acceptor_service::destroy(io_object::implementation* impl) auto* epoll_impl = static_cast(impl); epoll_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.remove(epoll_impl); - state_->acceptor_ptrs_.erase(epoll_impl); + state_->impl_list_.remove(epoll_impl); + state_->impl_ptrs_.erase(epoll_impl); } inline void @@ -431,27 +285,6 @@ epoll_acceptor_service::close(io_object::handle& h) static_cast(h.get())->close_socket(); } -inline std::error_code -epoll_acceptor::set_option( - int level, int optname, void const* data, std::size_t size) noexcept -{ - if (::setsockopt(fd_, level, optname, data, static_cast(size)) != - 0) - return make_err(errno); - return {}; -} - -inline std::error_code -epoll_acceptor::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return make_err(errno); - *size = static_cast(len); - return {}; -} - inline std::error_code epoll_acceptor_service::open_acceptor_socket( tcp_acceptor::implementation& impl, int family, int type, int protocol) @@ -485,41 +318,18 @@ inline std::error_code epoll_acceptor_service::bind_acceptor( tcp_acceptor::implementation& impl, endpoint ep) { - auto* epoll_impl = static_cast(&impl); - int fd = epoll_impl->fd_; - - sockaddr_storage storage{}; - socklen_t addrlen = detail::to_sockaddr(ep, storage); - if 
(::bind(fd, reinterpret_cast(&storage), addrlen) < 0) - return make_err(errno); - - // Cache local endpoint (resolves ephemeral port) - sockaddr_storage local{}; - socklen_t local_len = sizeof(local); - if (::getsockname(fd, reinterpret_cast(&local), &local_len) == 0) - epoll_impl->set_local_endpoint(detail::from_sockaddr(local)); - - return {}; + return static_cast(&impl)->do_bind(ep); } inline std::error_code epoll_acceptor_service::listen_acceptor( tcp_acceptor::implementation& impl, int backlog) { - auto* epoll_impl = static_cast(&impl); - int fd = epoll_impl->fd_; - - if (::listen(fd, backlog) < 0) - return make_err(errno); - - // Register fd with epoll (edge-triggered mode) - scheduler().register_descriptor(fd, &epoll_impl->desc_state_); - - return {}; + return static_cast(&impl)->do_listen(backlog); } inline void -epoll_acceptor_service::post(epoll_op* op) +epoll_acceptor_service::post(scheduler_op* op) { state_->sched_.post(op); } diff --git a/include/boost/corosio/native/detail/epoll/epoll_op.hpp b/include/boost/corosio/native/detail/epoll/epoll_op.hpp index 1a67a2be..f2a43970 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_op.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_op.hpp @@ -14,32 +14,8 @@ #if BOOST_COROSIO_HAS_EPOLL -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include +#include /* epoll Operation State @@ -86,252 +62,37 @@ struct epoll_op; // Forward declaration class epoll_scheduler; -/** Per-descriptor state for persistent epoll registration. - - Tracks pending operations for a file descriptor. The fd is registered - once with epoll and stays registered until closed. - - This struct extends scheduler_op to support deferred I/O processing. - When epoll events arrive, the reactor sets ready_events and queues - this descriptor for processing. 
When popped from the scheduler queue, - operator() performs the actual I/O and queues completion handlers. - - @par Deferred I/O Model - The reactor no longer performs I/O directly. Instead: - 1. Reactor sets ready_events and queues descriptor_state - 2. Scheduler pops descriptor_state and calls operator() - 3. operator() performs I/O under mutex and queues completions +/// Per-descriptor state for persistent epoll registration. +struct descriptor_state final : reactor_descriptor_state +{}; - This eliminates per-descriptor mutex locking from the reactor hot path. - - @par Thread Safety - The mutex protects operation pointers and ready flags during I/O. - ready_events_ and is_enqueued_ are atomic for lock-free reactor access. -*/ -struct descriptor_state final : scheduler_op +/// epoll base operation — thin wrapper over reactor_op. +struct epoll_op : reactor_op { - std::mutex mutex; - - // Protected by mutex - epoll_op* read_op = nullptr; - epoll_op* write_op = nullptr; - epoll_op* connect_op = nullptr; - - // Caches edge events that arrived before an op was registered - bool read_ready = false; - bool write_ready = false; - - // Deferred cancellation: set by cancel() when the target op is not - // parked (e.g. completing inline via speculative I/O). Checked when - // the next op parks; if set, the op is immediately self-cancelled. - // This matches IOCP semantics where CancelIoEx always succeeds. - bool read_cancel_pending = false; - bool write_cancel_pending = false; - bool connect_cancel_pending = false; - - // Set during registration only (no mutex needed) - std::uint32_t registered_events = 0; - int fd = -1; - - // For deferred I/O - set by reactor, read by scheduler - std::atomic ready_events_{0}; - std::atomic is_enqueued_{false}; - epoll_scheduler const* scheduler_ = nullptr; - - // Prevents impl destruction while this descriptor_state is queued. - // Set by close_socket() when is_enqueued_ is true, cleared by operator(). 
- std::shared_ptr impl_ref_; - - /// Add ready events atomically. - void add_ready_events(std::uint32_t ev) noexcept - { - ready_events_.fetch_or(ev, std::memory_order_relaxed); - } - - /// Perform deferred I/O and queue completions. void operator()() override; - - /// Destroy without invoking. - /// Called during scheduler::shutdown() drain. Clear impl_ref_ to break - /// the self-referential cycle set by close_socket(). - void destroy() override - { - impl_ref_.reset(); - } }; -struct epoll_op : scheduler_op +/// epoll connect operation. +struct epoll_connect_op final : reactor_connect_op { - struct canceller - { - epoll_op* op; - void operator()() const noexcept; - }; - - std::coroutine_handle<> h; - capy::executor_ref ex; - std::error_code* ec_out = nullptr; - std::size_t* bytes_out = nullptr; - - int fd = -1; - int errn = 0; - std::size_t bytes_transferred = 0; - - std::atomic cancelled{false}; - std::optional> stop_cb; - - // Prevents use-after-free when socket is closed with pending ops. - // See "Impl Lifetime Management" in file header. - std::shared_ptr impl_ptr; - - // For stop_token cancellation - pointer to owning socket/acceptor impl. - // When stop is requested, we call back to the impl to perform actual I/O cancellation. 
- epoll_socket* socket_impl_ = nullptr; - epoll_acceptor* acceptor_impl_ = nullptr; - - epoll_op() = default; - - void reset() noexcept - { - fd = -1; - errn = 0; - bytes_transferred = 0; - cancelled.store(false, std::memory_order_relaxed); - impl_ptr.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = nullptr; - } - - // Defined in sockets.cpp where epoll_socket is complete - void operator()() override; - - virtual bool is_read_operation() const noexcept - { - return false; - } - virtual void cancel() noexcept = 0; - - void destroy() override - { - stop_cb.reset(); - impl_ptr.reset(); - } - - void request_cancel() noexcept - { - cancelled.store(true, std::memory_order_release); - } - - void start(std::stop_token const& token, epoll_socket* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = impl; - acceptor_impl_ = nullptr; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void start(std::stop_token const& token, epoll_acceptor* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = impl; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void complete(int err, std::size_t bytes) noexcept - { - errn = err; - bytes_transferred = bytes; - } - - virtual void perform_io() noexcept {} -}; - -struct epoll_connect_op final : epoll_op -{ - endpoint target_endpoint; - - void reset() noexcept - { - epoll_op::reset(); - target_endpoint = endpoint{}; - } - - void perform_io() noexcept override - { - // connect() completion status is retrieved via SO_ERROR, not return value - int err = 0; - socklen_t len = sizeof(err); - if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) - err = errno; - complete(err, 0); - } - - // Defined in sockets.cpp where epoll_socket is complete void operator()() override; void cancel() noexcept override; }; -struct epoll_read_op final : epoll_op +/// epoll 
scatter-read operation. +struct epoll_read_op final : reactor_read_op { - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - bool empty_buffer_read = false; - - bool is_read_operation() const noexcept override - { - return !empty_buffer_read; - } - - void reset() noexcept - { - epoll_op::reset(); - iovec_count = 0; - empty_buffer_read = false; - } - - void perform_io() noexcept override - { - ssize_t n; - do - { - n = ::readv(fd, iovecs, iovec_count); - } - while (n < 0 && errno == EINTR); - - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); - } - void cancel() noexcept override; }; -struct epoll_write_op final : epoll_op +/** Provides sendmsg(MSG_NOSIGNAL) with EINTR retry for epoll writes. */ +struct epoll_write_policy { - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - - void reset() noexcept - { - epoll_op::reset(); - iovec_count = 0; - } - - void perform_io() noexcept override + static ssize_t write(int fd, iovec* iovecs, int count) noexcept { msghdr msg{}; msg.msg_iov = iovecs; - msg.msg_iovlen = static_cast(iovec_count); + msg.msg_iovlen = static_cast(count); ssize_t n; do @@ -339,54 +100,37 @@ struct epoll_write_op final : epoll_op n = ::sendmsg(fd, &msg, MSG_NOSIGNAL); } while (n < 0 && errno == EINTR); - - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); + return n; } +}; +/// epoll gather-write operation. +struct epoll_write_op final : reactor_write_op +{ void cancel() noexcept override; }; -struct epoll_accept_op final : epoll_op +/** Provides accept4(SOCK_NONBLOCK|SOCK_CLOEXEC) with EINTR retry. 
*/ +struct epoll_accept_policy { - int accepted_fd = -1; - io_object::implementation** impl_out = nullptr; - sockaddr_storage peer_storage{}; - - void reset() noexcept - { - epoll_op::reset(); - accepted_fd = -1; - impl_out = nullptr; - peer_storage = {}; - } - - void perform_io() noexcept override + static int do_accept(int fd, sockaddr_storage& peer) noexcept { - socklen_t addrlen = sizeof(peer_storage); + socklen_t addrlen = sizeof(peer); int new_fd; do { new_fd = ::accept4( - fd, reinterpret_cast(&peer_storage), &addrlen, + fd, reinterpret_cast(&peer), &addrlen, SOCK_NONBLOCK | SOCK_CLOEXEC); } while (new_fd < 0 && errno == EINTR); - - if (new_fd >= 0) - { - accepted_fd = new_fd; - complete(0, 0); - } - else - { - complete(errno, 0); - } + return new_fd; } +}; - // Defined in acceptors.cpp where epoll_acceptor is complete +/// epoll accept operation. +struct epoll_accept_op final : reactor_accept_op +{ void operator()() override; void cancel() noexcept override; }; diff --git a/include/boost/corosio/native/detail/epoll/epoll_scheduler.hpp b/include/boost/corosio/native/detail/epoll/epoll_scheduler.hpp index 63ebb0e9..c17ad595 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_scheduler.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_scheduler.hpp @@ -17,8 +17,7 @@ #include #include -#include -#include +#include #include #include @@ -27,22 +26,15 @@ #include #include -#include #include #include -#include -#include #include -#include #include -#include #include -#include #include #include -#include #include #include @@ -50,9 +42,6 @@ namespace boost::corosio::detail { struct epoll_op; struct descriptor_state; -namespace epoll { -struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context; -} // namespace epoll /** Linux scheduler using epoll for I/O multiplexing. @@ -73,13 +62,9 @@ struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context; @par Thread Safety All public member functions are thread-safe. 
*/ -class BOOST_COROSIO_DECL epoll_scheduler final - : public native_scheduler - , public capy::execution_context::service +class BOOST_COROSIO_DECL epoll_scheduler final : public reactor_scheduler_base { public: - using key_type = scheduler; - /** Construct the scheduler. Creates an epoll instance, eventfd for reactor interruption, @@ -96,18 +81,8 @@ class BOOST_COROSIO_DECL epoll_scheduler final epoll_scheduler(epoll_scheduler const&) = delete; epoll_scheduler& operator=(epoll_scheduler const&) = delete; + /// Shut down the scheduler, draining pending operations. void shutdown() override; - void post(std::coroutine_handle<> h) const override; - void post(scheduler_op* h) const override; - bool running_in_this_thread() const noexcept override; - void stop() override; - bool stopped() const noexcept override; - void restart() override; - std::size_t run() override; - std::size_t run_one() override; - std::size_t wait_one(long usec) override; - std::size_t poll() override; - std::size_t poll_one() override; /** Return the epoll file descriptor. @@ -121,19 +96,6 @@ class BOOST_COROSIO_DECL epoll_scheduler final return epoll_fd_; } - /** Reset the thread's inline completion budget. - - Called at the start of each posted completion handler to - grant a fresh budget for speculative inline completions. - */ - void reset_inline_budget() const noexcept; - - /** Consume one unit of inline budget if available. - - @return True if budget was available and consumed. - */ - bool try_consume_inline_budget() const noexcept; - /** Register a descriptor for persistent monitoring. The fd is registered once and stays registered until explicitly @@ -151,469 +113,27 @@ class BOOST_COROSIO_DECL epoll_scheduler final */ void deregister_descriptor(int fd) const; - void work_started() noexcept override; - void work_finished() noexcept override; - - /** Offset a forthcoming work_finished from work_cleanup. 
- - Called by descriptor_state when all I/O returned EAGAIN and no - handler will be executed. Must be called from a scheduler thread. - */ - void compensating_work_started() const noexcept; - - /** Drain work from thread context's private queue to global queue. - - Called by thread_context_guard destructor when a thread exits run(). - Transfers pending work to the global queue under mutex protection. - - @param queue The private queue to drain. - @param count Item count for wakeup decisions (wakes other threads if positive). - */ - void drain_thread_queue(op_queue& queue, long count) const; - - /** Post completed operations for deferred invocation. - - If called from a thread running this scheduler, operations go to - the thread's private queue (fast path). Otherwise, operations are - added to the global queue under mutex and a waiter is signaled. - - @par Preconditions - work_started() must have been called for each operation. - - @param ops Queue of operations to post. - */ - void post_deferred_completions(op_queue& ops) const; - private: - struct work_cleanup - { - epoll_scheduler* scheduler; - std::unique_lock* lock; - epoll::scheduler_context* ctx; - ~work_cleanup(); - }; - - struct task_cleanup - { - epoll_scheduler const* scheduler; - std::unique_lock* lock; - epoll::scheduler_context* ctx; - ~task_cleanup(); - }; - - std::size_t do_one( - std::unique_lock& lock, - long timeout_us, - epoll::scheduler_context* ctx); void - run_task(std::unique_lock& lock, epoll::scheduler_context* ctx); - void wake_one_thread_and_unlock(std::unique_lock& lock) const; - void interrupt_reactor() const; + run_task(std::unique_lock& lock, context_type* ctx) override; + void interrupt_reactor() const override; void update_timerfd() const; - /** Set the signaled state and wake all waiting threads. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. 
- */ - void signal_all(std::unique_lock& lock) const; - - /** Set the signaled state and wake one waiter if any exist. - - Only unlocks and signals if at least one thread is waiting. - Use this when the caller needs to perform a fallback action - (such as interrupting the reactor) when no waiters exist. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - - @return `true` if unlocked and signaled, `false` if lock still held. - */ - bool maybe_unlock_and_signal_one(std::unique_lock& lock) const; - - /** Set the signaled state, unlock, and wake one waiter if any exist. - - Always unlocks the mutex. Use this when the caller will release - the lock regardless of whether a waiter exists. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - - @return `true` if a waiter was signaled, `false` otherwise. - */ - bool unlock_and_signal_one(std::unique_lock& lock) const; - - /** Clear the signaled state before waiting. - - @par Preconditions - Mutex must be held. - */ - void clear_signal() const; - - /** Block until the signaled state is set. - - Returns immediately if already signaled (fast-path). Otherwise - increments the waiter count, waits on the condition variable, - and decrements the waiter count upon waking. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - */ - void wait_for_signal(std::unique_lock& lock) const; - - /** Block until signaled or timeout expires. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - @param timeout_us Maximum time to wait in microseconds. 
- */ - void wait_for_signal_for( - std::unique_lock& lock, long timeout_us) const; - int epoll_fd_; - int event_fd_; // for interrupting reactor - int timer_fd_; // timerfd for kernel-managed timer expiry - mutable std::mutex mutex_; - mutable std::condition_variable cond_; - mutable op_queue completed_ops_; - mutable std::atomic outstanding_work_; - bool stopped_; - - // True while a thread is blocked in epoll_wait. Used by - // wake_one_thread_and_unlock and work_finished to know when - // an eventfd interrupt is needed instead of a condvar signal. - mutable std::atomic task_running_{false}; - - // True when the reactor has been told to do a non-blocking poll - // (more handlers queued or poll mode). Prevents redundant eventfd - // writes and controls the epoll_wait timeout. - mutable bool task_interrupted_ = false; - - // Signaling state: bit 0 = signaled, upper bits = waiter count (incremented by 2) - mutable std::size_t state_ = 0; + int event_fd_; + int timer_fd_; // Edge-triggered eventfd state mutable std::atomic eventfd_armed_{false}; // Set when the earliest timer changes; flushed before epoll_wait - // blocks. Avoids timerfd_settime syscalls for timers that are - // scheduled then cancelled without being waited on. mutable std::atomic timerfd_stale_{false}; - - // Sentinel operation for interleaving reactor runs with handler execution. - // Ensures the reactor runs periodically even when handlers are continuously - // posted, preventing starvation of I/O events, timers, and signals. 
- struct task_op final : scheduler_op - { - void operator()() override {} - void destroy() override {} - }; - task_op task_op_; }; -//-------------------------------------------------------------------------- -// -// Implementation -// -//-------------------------------------------------------------------------- - -/* - epoll Scheduler - Single Reactor Model - ====================================== - - This scheduler uses a thread coordination strategy to provide handler - parallelism and avoid the thundering herd problem. - Instead of all threads blocking on epoll_wait(), one thread becomes the - "reactor" while others wait on a condition variable for handler work. - - Thread Model - ------------ - - ONE thread runs epoll_wait() at a time (the reactor thread) - - OTHER threads wait on cond_ (condition variable) for handlers - - When work is posted, exactly one waiting thread wakes via notify_one() - - This matches Windows IOCP semantics where N posted items wake N threads - - Event Loop Structure (do_one) - ----------------------------- - 1. Lock mutex, try to pop handler from queue - 2. If got handler: execute it (unlocked), return - 3. If queue empty and no reactor running: become reactor - - Run epoll_wait (unlocked), queue I/O completions, loop back - 4. If queue empty and reactor running: wait on condvar for work - - The task_running_ flag ensures only one thread owns epoll_wait(). - After the reactor queues I/O completions, it loops back to try getting - a handler, giving priority to handler execution over more I/O polling. 
- - Signaling State (state_) - ------------------------ - The state_ variable encodes two pieces of information: - - Bit 0: signaled flag (1 = signaled, persists until cleared) - - Upper bits: waiter count (each waiter adds 2 before blocking) - - This allows efficient coordination: - - Signalers only call notify when waiters exist (state_ > 1) - - Waiters check if already signaled before blocking (fast-path) - - Wake Coordination (wake_one_thread_and_unlock) - ---------------------------------------------- - When posting work: - - If waiters exist (state_ > 1): signal and notify_one() - - Else if reactor running: interrupt via eventfd write - - Else: no-op (thread will find work when it checks queue) - - This avoids waking threads unnecessarily. With cascading wakes, - each handler execution wakes at most one additional thread if - more work exists in the queue. - - Work Counting - ------------- - outstanding_work_ tracks pending operations. When it hits zero, run() - returns. Each operation increments on start, decrements on completion. - - Timer Integration - ----------------- - Timers are handled by timer_service. The reactor adjusts epoll_wait - timeout to wake for the nearest timer expiry. When a new timer is - scheduled earlier than current, timer_service calls interrupt_reactor() - to re-evaluate the timeout. 
-*/ - -namespace epoll { - -struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context -{ - epoll_scheduler const* key; - scheduler_context* next; - op_queue private_queue; - long private_outstanding_work; - int inline_budget; - int inline_budget_max; - bool unassisted; - - scheduler_context(epoll_scheduler const* k, scheduler_context* n) - : key(k) - , next(n) - , private_outstanding_work(0) - , inline_budget(0) - , inline_budget_max(2) - , unassisted(false) - { - } -}; - -inline thread_local_ptr context_stack; - -struct thread_context_guard -{ - scheduler_context frame_; - - explicit thread_context_guard(epoll_scheduler const* ctx) noexcept - : frame_(ctx, context_stack.get()) - { - context_stack.set(&frame_); - } - - ~thread_context_guard() noexcept - { - if (!frame_.private_queue.empty()) - frame_.key->drain_thread_queue( - frame_.private_queue, frame_.private_outstanding_work); - context_stack.set(frame_.next); - } -}; - -inline scheduler_context* -find_context(epoll_scheduler const* self) noexcept -{ - for (auto* c = context_stack.get(); c != nullptr; c = c->next) - if (c->key == self) - return c; - return nullptr; -} - -} // namespace epoll - -inline void -epoll_scheduler::reset_inline_budget() const noexcept -{ - if (auto* ctx = epoll::find_context(this)) - { - // Cap when no other thread absorbed queued work. A moderate - // cap (4) amortizes scheduling for small buffers while avoiding - // bursty I/O that fills socket buffers and stalls large transfers. - if (ctx->unassisted) - { - ctx->inline_budget_max = 4; - ctx->inline_budget = 4; - return; - } - // Ramp up when previous cycle fully consumed budget. - // Reset on partial consumption (EAGAIN hit or peer got scheduled). 
- if (ctx->inline_budget == 0) - ctx->inline_budget_max = (std::min)(ctx->inline_budget_max * 2, 16); - else if (ctx->inline_budget < ctx->inline_budget_max) - ctx->inline_budget_max = 2; - ctx->inline_budget = ctx->inline_budget_max; - } -} - -inline bool -epoll_scheduler::try_consume_inline_budget() const noexcept -{ - if (auto* ctx = epoll::find_context(this)) - { - if (ctx->inline_budget > 0) - { - --ctx->inline_budget; - return true; - } - } - return false; -} - -inline void -descriptor_state::operator()() -{ - is_enqueued_.store(false, std::memory_order_relaxed); - - // Take ownership of impl ref set by close_socket() to prevent - // the owning impl from being freed while we're executing - auto prevent_impl_destruction = std::move(impl_ref_); - - std::uint32_t ev = ready_events_.exchange(0, std::memory_order_acquire); - if (ev == 0) - { - scheduler_->compensating_work_started(); - return; - } - - op_queue local_ops; - - int err = 0; - if (ev & EPOLLERR) - { - socklen_t len = sizeof(err); - if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) - err = errno; - if (err == 0) - err = EIO; - } - - { - std::lock_guard lock(mutex); - if (ev & EPOLLIN) - { - if (read_op) - { - auto* rd = read_op; - if (err) - rd->complete(err, 0); - else - rd->perform_io(); - - if (rd->errn == EAGAIN || rd->errn == EWOULDBLOCK) - { - rd->errn = 0; - } - else - { - read_op = nullptr; - local_ops.push(rd); - } - } - else - { - read_ready = true; - } - } - if (ev & EPOLLOUT) - { - bool had_write_op = (connect_op || write_op); - if (connect_op) - { - auto* cn = connect_op; - if (err) - cn->complete(err, 0); - else - cn->perform_io(); - connect_op = nullptr; - local_ops.push(cn); - } - if (write_op) - { - auto* wr = write_op; - if (err) - wr->complete(err, 0); - else - wr->perform_io(); - - if (wr->errn == EAGAIN || wr->errn == EWOULDBLOCK) - { - wr->errn = 0; - } - else - { - write_op = nullptr; - local_ops.push(wr); - } - } - if (!had_write_op) - write_ready = true; - } - if 
(err) - { - if (read_op) - { - read_op->complete(err, 0); - local_ops.push(std::exchange(read_op, nullptr)); - } - if (write_op) - { - write_op->complete(err, 0); - local_ops.push(std::exchange(write_op, nullptr)); - } - if (connect_op) - { - connect_op->complete(err, 0); - local_ops.push(std::exchange(connect_op, nullptr)); - } - } - } - - // Execute first handler inline — the scheduler's work_cleanup - // accounts for this as the "consumed" work item - scheduler_op* first = local_ops.pop(); - if (first) - { - scheduler_->post_deferred_completions(local_ops); - (*first)(); - } - else - { - scheduler_->compensating_work_started(); - } -} - inline epoll_scheduler::epoll_scheduler(capy::execution_context& ctx, int) : epoll_fd_(-1) , event_fd_(-1) , timer_fd_(-1) - , outstanding_work_(0) - , stopped_(false) - , task_running_{false} - , task_interrupted_(false) - , state_(0) { epoll_fd_ = ::epoll_create1(EPOLL_CLOEXEC); if (epoll_fd_ < 0) @@ -665,17 +185,12 @@ inline epoll_scheduler::epoll_scheduler(capy::execution_context& ctx, int) timer_service::callback(this, [](void* p) { auto* self = static_cast(p); self->timerfd_stale_.store(true, std::memory_order_release); - if (self->task_running_.load(std::memory_order_acquire)) - self->interrupt_reactor(); + self->interrupt_reactor(); })); - // Initialize resolver service get_resolver_service(ctx, *this); - - // Initialize signal service get_signal_service(ctx, *this); - // Push task sentinel to interleave reactor runs with handler execution completed_ops_.push(&task_op_); } @@ -692,217 +207,12 @@ inline epoll_scheduler::~epoll_scheduler() inline void epoll_scheduler::shutdown() { - { - std::unique_lock lock(mutex_); - - while (auto* h = completed_ops_.pop()) - { - if (h == &task_op_) - continue; - lock.unlock(); - h->destroy(); - lock.lock(); - } - - signal_all(lock); - } + shutdown_drain(); if (event_fd_ >= 0) interrupt_reactor(); } -inline void -epoll_scheduler::post(std::coroutine_handle<> h) const -{ - struct 
post_handler final : scheduler_op - { - std::coroutine_handle<> h_; - - explicit post_handler(std::coroutine_handle<> h) : h_(h) {} - - ~post_handler() override = default; - - void operator()() override - { - auto h = h_; - delete this; - h.resume(); - } - - void destroy() override - { - auto h = h_; - delete this; - h.destroy(); - } - }; - - auto ph = std::make_unique(h); - - // Fast path: same thread posts to private queue - // Only count locally; work_cleanup batches to global counter - if (auto* ctx = epoll::find_context(this)) - { - ++ctx->private_outstanding_work; - ctx->private_queue.push(ph.release()); - return; - } - - // Slow path: cross-thread post requires mutex - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(ph.release()); - wake_one_thread_and_unlock(lock); -} - -inline void -epoll_scheduler::post(scheduler_op* h) const -{ - // Fast path: same thread posts to private queue - // Only count locally; work_cleanup batches to global counter - if (auto* ctx = epoll::find_context(this)) - { - ++ctx->private_outstanding_work; - ctx->private_queue.push(h); - return; - } - - // Slow path: cross-thread post requires mutex - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(h); - wake_one_thread_and_unlock(lock); -} - -inline bool -epoll_scheduler::running_in_this_thread() const noexcept -{ - for (auto* c = epoll::context_stack.get(); c != nullptr; c = c->next) - if (c->key == this) - return true; - return false; -} - -inline void -epoll_scheduler::stop() -{ - std::unique_lock lock(mutex_); - if (!stopped_) - { - stopped_ = true; - signal_all(lock); - interrupt_reactor(); - } -} - -inline bool -epoll_scheduler::stopped() const noexcept -{ - std::unique_lock lock(mutex_); - return stopped_; -} - -inline void -epoll_scheduler::restart() -{ - std::unique_lock lock(mutex_); - stopped_ = false; -} - -inline std::size_t 
-epoll_scheduler::run() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - epoll::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - - std::size_t n = 0; - for (;;) - { - if (!do_one(lock, -1, &ctx.frame_)) - break; - if (n != (std::numeric_limits::max)()) - ++n; - if (!lock.owns_lock()) - lock.lock(); - } - return n; -} - -inline std::size_t -epoll_scheduler::run_one() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - epoll::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, -1, &ctx.frame_); -} - -inline std::size_t -epoll_scheduler::wait_one(long usec) -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - epoll::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, usec, &ctx.frame_); -} - -inline std::size_t -epoll_scheduler::poll() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - epoll::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - - std::size_t n = 0; - for (;;) - { - if (!do_one(lock, 0, &ctx.frame_)) - break; - if (n != (std::numeric_limits::max)()) - ++n; - if (!lock.owns_lock()) - lock.lock(); - } - return n; -} - -inline std::size_t -epoll_scheduler::poll_one() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - epoll::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, 0, &ctx.frame_); -} - inline void epoll_scheduler::register_descriptor(int fd, descriptor_state* desc) const { @@ -916,8 +226,10 @@ epoll_scheduler::register_descriptor(int fd, descriptor_state* desc) const desc->registered_events = ev.events; desc->fd = fd; desc->scheduler_ = this; + desc->ready_events_.store(0, std::memory_order_relaxed); std::lock_guard lock(desc->mutex); + desc->impl_ref_.reset(); 
desc->read_ready = false; desc->write_ready = false; } @@ -928,60 +240,9 @@ epoll_scheduler::deregister_descriptor(int fd) const ::epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, nullptr); } -inline void -epoll_scheduler::work_started() noexcept -{ - outstanding_work_.fetch_add(1, std::memory_order_relaxed); -} - -inline void -epoll_scheduler::work_finished() noexcept -{ - if (outstanding_work_.fetch_sub(1, std::memory_order_acq_rel) == 1) - stop(); -} - -inline void -epoll_scheduler::compensating_work_started() const noexcept -{ - auto* ctx = epoll::find_context(this); - if (ctx) - ++ctx->private_outstanding_work; -} - -inline void -epoll_scheduler::drain_thread_queue(op_queue& queue, long count) const -{ - // Note: outstanding_work_ was already incremented when posting - std::unique_lock lock(mutex_); - completed_ops_.splice(queue); - if (count > 0) - maybe_unlock_and_signal_one(lock); -} - -inline void -epoll_scheduler::post_deferred_completions(op_queue& ops) const -{ - if (ops.empty()) - return; - - // Fast path: if on scheduler thread, use private queue - if (auto* ctx = epoll::find_context(this)) - { - ctx->private_queue.splice(ops); - return; - } - - // Slow path: add to global queue and wake a thread - std::unique_lock lock(mutex_); - completed_ops_.splice(ops); - wake_one_thread_and_unlock(lock); -} - inline void epoll_scheduler::interrupt_reactor() const { - // Only write if not already armed to avoid redundant writes bool expected = false; if (eventfd_armed_.compare_exchange_strong( expected, true, std::memory_order_release, @@ -992,130 +253,6 @@ epoll_scheduler::interrupt_reactor() const } } -inline void -epoll_scheduler::signal_all(std::unique_lock&) const -{ - state_ |= 1; - cond_.notify_all(); -} - -inline bool -epoll_scheduler::maybe_unlock_and_signal_one( - std::unique_lock& lock) const -{ - state_ |= 1; - if (state_ > 1) - { - lock.unlock(); - cond_.notify_one(); - return true; - } - return false; -} - -inline bool 
-epoll_scheduler::unlock_and_signal_one(std::unique_lock& lock) const -{ - state_ |= 1; - bool have_waiters = state_ > 1; - lock.unlock(); - if (have_waiters) - cond_.notify_one(); - return have_waiters; -} - -inline void -epoll_scheduler::clear_signal() const -{ - state_ &= ~std::size_t(1); -} - -inline void -epoll_scheduler::wait_for_signal(std::unique_lock& lock) const -{ - while ((state_ & 1) == 0) - { - state_ += 2; - cond_.wait(lock); - state_ -= 2; - } -} - -inline void -epoll_scheduler::wait_for_signal_for( - std::unique_lock& lock, long timeout_us) const -{ - if ((state_ & 1) == 0) - { - state_ += 2; - cond_.wait_for(lock, std::chrono::microseconds(timeout_us)); - state_ -= 2; - } -} - -inline void -epoll_scheduler::wake_one_thread_and_unlock( - std::unique_lock& lock) const -{ - if (maybe_unlock_and_signal_one(lock)) - return; - - if (task_running_.load(std::memory_order_relaxed) && !task_interrupted_) - { - task_interrupted_ = true; - lock.unlock(); - interrupt_reactor(); - } - else - { - lock.unlock(); - } -} - -inline epoll_scheduler::work_cleanup::~work_cleanup() -{ - if (ctx) - { - long produced = ctx->private_outstanding_work; - if (produced > 1) - scheduler->outstanding_work_.fetch_add( - produced - 1, std::memory_order_relaxed); - else if (produced < 1) - scheduler->work_finished(); - ctx->private_outstanding_work = 0; - - if (!ctx->private_queue.empty()) - { - lock->lock(); - scheduler->completed_ops_.splice(ctx->private_queue); - } - } - else - { - scheduler->work_finished(); - } -} - -inline epoll_scheduler::task_cleanup::~task_cleanup() -{ - if (!ctx) - return; - - if (ctx->private_outstanding_work > 0) - { - scheduler->outstanding_work_.fetch_add( - ctx->private_outstanding_work, std::memory_order_relaxed); - ctx->private_outstanding_work = 0; - } - - if (!ctx->private_queue.empty()) - { - if (!lock->owns_lock()) - lock->lock(); - scheduler->completed_ops_.splice(ctx->private_queue); - } -} - inline void epoll_scheduler::update_timerfd() 
const { @@ -1126,14 +263,14 @@ epoll_scheduler::update_timerfd() const if (nearest == timer_service::time_point::max()) { - // No timers - disarm by setting to 0 (relative) + // No timers — disarm by setting to 0 (relative) } else { auto now = std::chrono::steady_clock::now(); if (nearest <= now) { - // Use 1ns instead of 0 - zero disarms the timerfd + // Use 1ns instead of 0 — zero disarms the timerfd ts.it_value.tv_nsec = 1; } else @@ -1143,7 +280,6 @@ epoll_scheduler::update_timerfd() const .count(); ts.it_value.tv_sec = nsec / 1000000000; ts.it_value.tv_nsec = nsec % 1000000000; - // Ensure non-zero to avoid disarming if duration rounds to 0 if (ts.it_value.tv_sec == 0 && ts.it_value.tv_nsec == 0) ts.it_value.tv_nsec = 1; } @@ -1154,8 +290,7 @@ epoll_scheduler::update_timerfd() const } inline void -epoll_scheduler::run_task( - std::unique_lock& lock, epoll::scheduler_context* ctx) +epoll_scheduler::run_task(std::unique_lock& lock, context_type* ctx) { int timeout_ms = task_interrupted_ ? 0 : -1; @@ -1168,7 +303,6 @@ epoll_scheduler::run_task( if (timerfd_stale_.exchange(false, std::memory_order_acquire)) update_timerfd(); - // Event loop runs without mutex held epoll_event events[128]; int nfds = ::epoll_wait(epoll_fd_, events, 128, timeout_ms); @@ -1178,13 +312,11 @@ epoll_scheduler::run_task( bool check_timers = false; op_queue local_ops; - // Process events without holding the mutex for (int i = 0; i < nfds; ++i) { if (events[i].data.ptr == nullptr) { std::uint64_t val; - // Mutex released above; analyzer can't track unlock via ref // NOLINTNEXTLINE(clang-analyzer-unix.BlockInCriticalSection) [[maybe_unused]] auto r = ::read(event_fd_, &val, sizeof(val)); eventfd_armed_.store(false, std::memory_order_relaxed); @@ -1201,12 +333,9 @@ epoll_scheduler::run_task( continue; } - // Deferred I/O: just set ready events and enqueue descriptor - // No per-descriptor mutex locking in reactor hot path! 
auto* desc = static_cast(events[i].data.ptr); desc->add_ready_events(events[i].events); - // Only enqueue if not already enqueued bool expected = false; if (desc->is_enqueued_.compare_exchange_strong( expected, true, std::memory_order_release, @@ -1216,7 +345,6 @@ epoll_scheduler::run_task( } } - // Process timers only when timerfd fires if (check_timers) { timer_svc_->process_expired(); @@ -1229,79 +357,6 @@ epoll_scheduler::run_task( completed_ops_.splice(local_ops); } -inline std::size_t -epoll_scheduler::do_one( - std::unique_lock& lock, - long timeout_us, - epoll::scheduler_context* ctx) -{ - for (;;) - { - if (stopped_) - return 0; - - scheduler_op* op = completed_ops_.pop(); - - // Handle reactor sentinel - time to poll for I/O - if (op == &task_op_) - { - bool more_handlers = !completed_ops_.empty(); - - // Nothing to run the reactor for: no pending work to wait on, - // or caller requested a non-blocking poll - if (!more_handlers && - (outstanding_work_.load(std::memory_order_acquire) == 0 || - timeout_us == 0)) - { - completed_ops_.push(&task_op_); - return 0; - } - - task_interrupted_ = more_handlers || timeout_us == 0; - task_running_.store(true, std::memory_order_release); - - if (more_handlers) - unlock_and_signal_one(lock); - - run_task(lock, ctx); - - task_running_.store(false, std::memory_order_relaxed); - completed_ops_.push(&task_op_); - continue; - } - - // Handle operation - if (op != nullptr) - { - bool more = !completed_ops_.empty(); - - if (more) - ctx->unassisted = !unlock_and_signal_one(lock); - else - { - ctx->unassisted = false; - lock.unlock(); - } - - work_cleanup on_exit{this, &lock, ctx}; - - (*op)(); - return 1; - } - - // No pending work to wait on, or caller requested non-blocking poll - if (outstanding_work_.load(std::memory_order_acquire) == 0 || - timeout_us == 0) - return 0; - - clear_signal(); - if (timeout_us < 0) - wait_for_signal(lock); - else - wait_for_signal_for(lock, timeout_us); - } -} - } // namespace 
boost::corosio::detail #endif // BOOST_COROSIO_HAS_EPOLL diff --git a/include/boost/corosio/native/detail/epoll/epoll_socket.hpp b/include/boost/corosio/native/detail/epoll/epoll_socket.hpp index b1c8a4d6..99d4b252 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_socket.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_socket.hpp @@ -14,13 +14,9 @@ #if BOOST_COROSIO_HAS_EPOLL -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { @@ -28,9 +24,14 @@ class epoll_socket_service; /// Socket implementation for epoll backend. class epoll_socket final - : public tcp_socket::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_socket< + epoll_socket, + epoll_socket_service, + epoll_op, + epoll_connect_op, + epoll_read_op, + epoll_write_op, + descriptor_state> { friend class epoll_socket_service; @@ -61,68 +62,8 @@ class epoll_socket final std::error_code*, std::size_t*) override; - std::error_code shutdown(tcp_socket::shutdown_type what) noexcept override; - - native_handle_type native_handle() const noexcept override - { - return fd_; - } - - std::error_code set_option( - int level, - int optname, - void const* data, - std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - endpoint remote_endpoint() const noexcept override - { - return remote_endpoint_; - } - bool is_open() const noexcept - { - return fd_ >= 0; - } void cancel() noexcept override; - void cancel_single_op(epoll_op& op) noexcept; void close_socket() noexcept; - void set_socket(int fd) noexcept - { - fd_ = fd; - } - void set_endpoints(endpoint local, endpoint remote) noexcept - { - local_endpoint_ = local; - remote_endpoint_ = remote; - } - - epoll_connect_op conn_; - epoll_read_op rd_; - epoll_write_op 
wr_; - - /// Per-descriptor state for persistent epoll registration - descriptor_state desc_state_; - -private: - epoll_socket_service& svc_; - int fd_ = -1; - endpoint local_endpoint_; - endpoint remote_endpoint_; - - void register_op( - epoll_op& op, - epoll_op*& desc_slot, - bool& ready_flag, - bool& cancel_flag) noexcept; - - friend struct epoll_op; - friend struct epoll_connect_op; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/epoll/epoll_socket_service.hpp b/include/boost/corosio/native/detail/epoll/epoll_socket_service.hpp index 707c6447..07638b83 100644 --- a/include/boost/corosio/native/detail/epoll/epoll_socket_service.hpp +++ b/include/boost/corosio/native/detail/epoll/epoll_socket_service.hpp @@ -20,16 +20,12 @@ #include #include +#include -#include -#include -#include -#include -#include +#include #include #include -#include #include #include @@ -70,7 +66,7 @@ Impl Lifetime with shared_ptr ----------------------------- Socket impls use enable_shared_from_this. The service owns impls via - shared_ptr maps (socket_ptrs_) keyed by raw pointer for O(1) lookup and + shared_ptr maps (impl_ptrs_) keyed by raw pointer for O(1) lookup and removal. When a user calls close(), we call cancel() which posts pending ops to the scheduler. @@ -90,20 +86,8 @@ namespace boost::corosio::detail { -/** State for epoll socket service. */ -class epoll_socket_state -{ -public: - explicit epoll_socket_state(epoll_scheduler& sched) noexcept : sched_(sched) - { - } - - epoll_scheduler& sched_; - std::mutex mutex_; - intrusive_list socket_list_; - std::unordered_map> - socket_ptrs_; -}; +/// State for epoll socket service. +using epoll_socket_state = reactor_service_state; /** epoll socket service implementation. 
@@ -134,7 +118,7 @@ class BOOST_COROSIO_DECL epoll_socket_service final : public socket_service { return state_->sched_; } - void post(epoll_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -142,57 +126,6 @@ class BOOST_COROSIO_DECL epoll_socket_service final : public socket_service std::unique_ptr state_; }; -//-------------------------------------------------------------------------- -// -// Implementation -// -//-------------------------------------------------------------------------- - -// Register an op with the reactor, handling cached edge events. -// Called under the EAGAIN/EINPROGRESS path when speculative I/O failed. -inline void -epoll_socket::register_op( - epoll_op& op, - epoll_op*& desc_slot, - bool& ready_flag, - bool& cancel_flag) noexcept -{ - svc_.work_started(); - - std::lock_guard lock(desc_state_.mutex); - bool io_done = false; - if (ready_flag) - { - ready_flag = false; - op.perform_io(); - io_done = (op.errn != EAGAIN && op.errn != EWOULDBLOCK); - if (!io_done) - op.errn = 0; - } - - if (cancel_flag) - { - cancel_flag = false; - op.cancelled.store(true, std::memory_order_relaxed); - } - - if (io_done || op.cancelled.load(std::memory_order_acquire)) - { - svc_.post(&op); - svc_.work_finished(); - } - else - { - desc_slot = &op; - } -} - -inline void -epoll_op::canceller::operator()() const noexcept -{ - op->cancel(); -} - inline void epoll_connect_op::cancel() noexcept { @@ -223,71 +156,17 @@ epoll_write_op::cancel() noexcept inline void epoll_op::operator()() { - stop_cb.reset(); - - socket_impl_->svc_.scheduler().reset_inline_budget(); - - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else if (is_read_operation() && bytes_transferred == 0) - *ec_out = capy::error::eof; - else - *ec_out = {}; - - *bytes_out = bytes_transferred; - - // Move to stack before resuming coroutine. 
The coroutine might close - // the socket, releasing the last wrapper ref. If impl_ptr were the - // last ref and we destroyed it while still in operator(), we'd have - // use-after-free. Moving to local ensures destruction happens at - // function exit, after all member accesses are complete. - capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_io_op(*this); } inline void epoll_connect_op::operator()() { - stop_cb.reset(); - - socket_impl_->svc_.scheduler().reset_inline_budget(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - // Cache endpoints on successful connect - if (success && socket_impl_) - { - endpoint local_ep; - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd, reinterpret_cast(&local_storage), &local_len) == - 0) - local_ep = from_sockaddr(local_storage); - static_cast(socket_impl_) - ->set_endpoints(local_ep, target_endpoint); - } - - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - - // Move to stack before resuming. See epoll_op::operator()() for rationale. 
- capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_connect_op(*this); } inline epoll_socket::epoll_socket(epoll_socket_service& svc) noexcept - : svc_(svc) + : reactor_socket(svc) { } @@ -301,59 +180,7 @@ epoll_socket::connect( std::stop_token token, std::error_code* ec) { - auto& op = conn_; - - sockaddr_storage storage{}; - socklen_t addrlen = - detail::to_sockaddr(ep, detail::socket_family(fd_), storage); - int result = ::connect(fd_, reinterpret_cast(&storage), addrlen); - - if (result == 0) - { - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd_, reinterpret_cast(&local_storage), &local_len) == - 0) - local_endpoint_ = detail::from_sockaddr(local_storage); - remote_endpoint_ = ep; - } - - if (result == 0 || errno != EINPROGRESS) - { - int err = (result < 0) ? errno : 0; - if (svc_.scheduler().try_consume_inline_budget()) - { - *ec = err ? 
make_err(err) : std::error_code{}; - return dispatch_coro(ex, h); - } - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.fd = fd_; - op.target_endpoint = ep; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EINPROGRESS — register with reactor - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.fd = fd_; - op.target_endpoint = ep; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.connect_op, desc_state_.write_ready, - desc_state_.connect_cancel_pending); - return std::noop_coroutine(); + return do_connect(h, ex, ep, token, ec); } inline std::coroutine_handle<> @@ -365,81 +192,7 @@ epoll_socket::read_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = rd_; - op.reset(); - - capy::mutable_buffer bufs[epoll_read_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, epoll_read_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.empty_buffer_read = true; - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(0, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - // Speculative read - ssize_t n; - do - { - n = ::readv(fd_, op.iovecs, op.iovec_count); - } - while (n < 0 && errno == EINTR); - - if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) - { - int err = (n < 0) ? errno : 0; - auto bytes = (n > 0) ? 
static_cast(n) : std::size_t(0); - - if (svc_.scheduler().try_consume_inline_budget()) - { - if (err) - *ec = make_err(err); - else if (n == 0) - *ec = capy::error::eof; - else - *ec = {}; - *bytes_out = bytes; - return dispatch_coro(ex, h); - } - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, bytes); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EAGAIN — register with reactor - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.read_op, desc_state_.read_ready, - desc_state_.read_cancel_pending); - return std::noop_coroutine(); + return do_read_some(h, ex, param, token, ec, bytes_out); } inline std::coroutine_handle<> @@ -451,276 +204,19 @@ epoll_socket::write_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = wr_; - op.reset(); - - capy::mutable_buffer bufs[epoll_write_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, epoll_write_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(0, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - // Speculative write - msghdr msg{}; - msg.msg_iov = op.iovecs; - msg.msg_iovlen = static_cast(op.iovec_count); - - ssize_t n; - do - { - n = ::sendmsg(fd_, &msg, MSG_NOSIGNAL); - } - while (n < 0 && errno == EINTR); - - if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) - { - int err = (n < 0) ? errno : 0; - auto bytes = (n > 0) ? 
static_cast(n) : std::size_t(0); - - if (svc_.scheduler().try_consume_inline_budget()) - { - *ec = err ? make_err(err) : std::error_code{}; - *bytes_out = bytes; - return dispatch_coro(ex, h); - } - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, bytes); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EAGAIN — register with reactor - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.write_op, desc_state_.write_ready, - desc_state_.write_cancel_pending); - return std::noop_coroutine(); -} - -inline std::error_code -epoll_socket::shutdown(tcp_socket::shutdown_type what) noexcept -{ - int how; - switch (what) - { - case tcp_socket::shutdown_receive: - how = SHUT_RD; - break; - case tcp_socket::shutdown_send: - how = SHUT_WR; - break; - case tcp_socket::shutdown_both: - how = SHUT_RDWR; - break; - default: - return make_err(EINVAL); - } - if (::shutdown(fd_, how) != 0) - return make_err(errno); - return {}; -} - -inline std::error_code -epoll_socket::set_option( - int level, int optname, void const* data, std::size_t size) noexcept -{ - if (::setsockopt(fd_, level, optname, data, static_cast(size)) != - 0) - return make_err(errno); - return {}; -} - -inline std::error_code -epoll_socket::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return make_err(errno); - *size = static_cast(len); - return {}; + return do_write_some(h, ex, param, token, ec, bytes_out); } inline void epoll_socket::cancel() noexcept { - auto self = weak_from_this().lock(); - if (!self) - return; - - conn_.request_cancel(); - rd_.request_cancel(); - wr_.request_cancel(); - - epoll_op* conn_claimed = nullptr; - epoll_op* 
rd_claimed = nullptr; - epoll_op* wr_claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.connect_op == &conn_) - conn_claimed = std::exchange(desc_state_.connect_op, nullptr); - else - desc_state_.connect_cancel_pending = true; - if (desc_state_.read_op == &rd_) - rd_claimed = std::exchange(desc_state_.read_op, nullptr); - else - desc_state_.read_cancel_pending = true; - if (desc_state_.write_op == &wr_) - wr_claimed = std::exchange(desc_state_.write_op, nullptr); - else - desc_state_.write_cancel_pending = true; - } - - if (conn_claimed) - { - conn_.impl_ptr = self; - svc_.post(&conn_); - svc_.work_finished(); - } - if (rd_claimed) - { - rd_.impl_ptr = self; - svc_.post(&rd_); - svc_.work_finished(); - } - if (wr_claimed) - { - wr_.impl_ptr = self; - svc_.post(&wr_); - svc_.work_finished(); - } -} - -inline void -epoll_socket::cancel_single_op(epoll_op& op) noexcept -{ - auto self = weak_from_this().lock(); - if (!self) - return; - - op.request_cancel(); - - epoll_op** desc_op_ptr = nullptr; - if (&op == &conn_) - desc_op_ptr = &desc_state_.connect_op; - else if (&op == &rd_) - desc_op_ptr = &desc_state_.read_op; - else if (&op == &wr_) - desc_op_ptr = &desc_state_.write_op; - - if (desc_op_ptr) - { - epoll_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (*desc_op_ptr == &op) - claimed = std::exchange(*desc_op_ptr, nullptr); - else if (&op == &conn_) - desc_state_.connect_cancel_pending = true; - else if (&op == &rd_) - desc_state_.read_cancel_pending = true; - else if (&op == &wr_) - desc_state_.write_cancel_pending = true; - } - if (claimed) - { - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } - } + do_cancel(); } inline void epoll_socket::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - conn_.request_cancel(); - rd_.request_cancel(); - wr_.request_cancel(); - - epoll_op* conn_claimed = nullptr; - epoll_op* rd_claimed = nullptr; - epoll_op* wr_claimed 
= nullptr; - { - std::lock_guard lock(desc_state_.mutex); - conn_claimed = std::exchange(desc_state_.connect_op, nullptr); - rd_claimed = std::exchange(desc_state_.read_op, nullptr); - wr_claimed = std::exchange(desc_state_.write_op, nullptr); - desc_state_.read_ready = false; - desc_state_.write_ready = false; - desc_state_.read_cancel_pending = false; - desc_state_.write_cancel_pending = false; - desc_state_.connect_cancel_pending = false; - } - - if (conn_claimed) - { - conn_.impl_ptr = self; - svc_.post(&conn_); - svc_.work_finished(); - } - if (rd_claimed) - { - rd_.impl_ptr = self; - svc_.post(&rd_); - svc_.work_finished(); - } - if (wr_claimed) - { - wr_.impl_ptr = self; - svc_.post(&wr_); - svc_.work_finished(); - } - - if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) - desc_state_.impl_ref_ = self; - } - - if (fd_ >= 0) - { - if (desc_state_.registered_events != 0) - svc_.scheduler().deregister_descriptor(fd_); - ::close(fd_); - fd_ = -1; - } - - desc_state_.fd = -1; - desc_state_.registered_events = 0; - - local_endpoint_ = endpoint{}; - remote_endpoint_ = endpoint{}; + do_close_socket(); } inline epoll_socket_service::epoll_socket_service(capy::execution_context& ctx) @@ -737,10 +233,10 @@ epoll_socket_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->socket_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) impl->close_socket(); - // Don't clear socket_ptrs_ here. The scheduler shuts down after us and + // Don't clear impl_ptrs_ here. The scheduler shuts down after us and // drains completed_ops_, calling destroy() on each queued op. 
If we // released our shared_ptrs now, an epoll_op::destroy() could free the // last ref to an impl whose embedded descriptor_state is still linked @@ -757,8 +253,8 @@ epoll_socket_service::construct() { std::lock_guard lock(state_->mutex_); - state_->socket_list_.push_back(raw); - state_->socket_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); } return raw; @@ -770,8 +266,8 @@ epoll_socket_service::destroy(io_object::implementation* impl) auto* epoll_impl = static_cast(impl); epoll_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->socket_list_.remove(epoll_impl); - state_->socket_ptrs_.erase(epoll_impl); + state_->impl_list_.remove(epoll_impl); + state_->impl_ptrs_.erase(epoll_impl); } inline std::error_code @@ -813,7 +309,7 @@ epoll_socket_service::close(io_object::handle& h) } inline void -epoll_socket_service::post(epoll_op* op) +epoll_socket_service::post(scheduler_op* op) { state_->sched_.post(op); } diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_acceptor.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_acceptor.hpp index 34ed997c..d9fd7952 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_acceptor.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_acceptor.hpp @@ -15,13 +15,9 @@ #if BOOST_COROSIO_HAS_KQUEUE -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { @@ -29,9 +25,12 @@ class kqueue_acceptor_service; /// Acceptor implementation for kqueue backend. 
class kqueue_acceptor final - : public tcp_acceptor::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_acceptor< + kqueue_acceptor, + kqueue_acceptor_service, + kqueue_op, + kqueue_accept_op, + descriptor_state> { friend class kqueue_acceptor_service; @@ -65,55 +64,8 @@ class kqueue_acceptor final std::error_code* ec, io_object::implementation** out_impl) override; - int native_handle() const noexcept - { - return fd_; - } - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - bool is_open() const noexcept override - { - return fd_ >= 0; - } - - /** Cancel any pending accept operation. */ void cancel() noexcept override; - - std::error_code set_option( - int level, - int optname, - void const* data, - std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - - /** Cancel a specific pending operation. - - @param op The operation to cancel. - */ - void cancel_single_op(kqueue_op& op) noexcept; - - /** Close the listening socket and cancel pending operations. 
*/ void close_socket() noexcept; - void set_local_endpoint(endpoint ep) noexcept - { - local_endpoint_ = ep; - } - - kqueue_acceptor_service& service() noexcept - { - return svc_; - } - -private: - kqueue_acceptor_service& svc_; - kqueue_accept_op acc_; - descriptor_state desc_state_; - int fd_ = -1; - endpoint local_endpoint_; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_acceptor_service.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_acceptor_service.hpp index 8fee98a6..6debc542 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_acceptor_service.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_acceptor_service.hpp @@ -22,14 +22,12 @@ #include #include #include +#include -#include -#include -#include +#include #include #include -#include #include #include @@ -40,24 +38,9 @@ namespace boost::corosio::detail { -/** State for kqueue acceptor service. */ -class kqueue_acceptor_state -{ - friend class kqueue_acceptor_service; - -public: - explicit kqueue_acceptor_state(kqueue_scheduler& sched) noexcept - : sched_(sched) - { - } - -private: - kqueue_scheduler& sched_; - std::mutex mutex_; - intrusive_list acceptor_list_; - std::unordered_map> - acceptor_ptrs_; -}; +/// State for kqueue acceptor service. +using kqueue_acceptor_state = + reactor_service_state; /** kqueue acceptor service implementation. 
@@ -91,7 +74,7 @@ class BOOST_COROSIO_DECL kqueue_acceptor_service final : public acceptor_service { return state_->sched_; } - void post(kqueue_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -115,139 +98,11 @@ kqueue_accept_op::cancel() noexcept inline void kqueue_accept_op::operator()() { - stop_cb.reset(); - - static_cast(acceptor_impl_) - ->service() - .scheduler() - .reset_inline_budget(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - } - - if (success && accepted_fd >= 0) - { - if (acceptor_impl_) - { - auto* socket_svc = static_cast(acceptor_impl_) - ->service() - .socket_service(); - if (socket_svc) - { - auto& impl = - static_cast(*socket_svc->construct()); - impl.set_socket(accepted_fd); - - // Register accepted socket with kqueue (edge-triggered via EV_CLEAR) - impl.desc_state_.fd = accepted_fd; - { - std::lock_guard lock(impl.desc_state_.mutex); - impl.desc_state_.read_op = nullptr; - impl.desc_state_.write_op = nullptr; - impl.desc_state_.connect_op = nullptr; - } - socket_svc->scheduler().register_descriptor( - accepted_fd, &impl.desc_state_); - - // Suppress SIGPIPE on the accepted socket; macOS lacks MSG_NOSIGNAL - int one = 1; - if (::setsockopt( - accepted_fd, SOL_SOCKET, SO_NOSIGPIPE, &one, - sizeof(one)) == -1) - { - if (ec_out) - *ec_out = make_err(errno); - socket_svc->destroy(&impl); - accepted_fd = -1; - if (impl_out) - *impl_out = nullptr; - } - else - { - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - sockaddr_storage remote_storage{}; - socklen_t remote_len = sizeof(remote_storage); - - endpoint local_ep, remote_ep; - if (::getsockname( - accepted_fd, - reinterpret_cast(&local_storage), - &local_len) == 0) - local_ep = 
from_sockaddr(local_storage); - if (::getpeername( - accepted_fd, - reinterpret_cast(&remote_storage), - &remote_len) == 0) - remote_ep = from_sockaddr(remote_storage); - - impl.set_endpoints(local_ep, remote_ep); - - if (impl_out) - *impl_out = &impl; - - accepted_fd = -1; - } - } - else - { - if (ec_out && !*ec_out) - *ec_out = make_err(ENOENT); - ::close(accepted_fd); - accepted_fd = -1; - if (impl_out) - *impl_out = nullptr; - } - } - else - { - ::close(accepted_fd); - accepted_fd = -1; - if (impl_out) - *impl_out = nullptr; - } - } - else - { - if (accepted_fd >= 0) - { - ::close(accepted_fd); - accepted_fd = -1; - } - - if (peer_impl) - { - auto* socket_svc_cleanup = - static_cast(acceptor_impl_) - ->service() - .socket_service(); - if (socket_svc_cleanup) - socket_svc_cleanup->destroy(peer_impl); - peer_impl = nullptr; - } - - if (impl_out) - *impl_out = nullptr; - } - - // Move to stack before resuming. See kqueue_op::operator()() for rationale. - capy::executor_ref saved_ex(std::move(ex)); - std::coroutine_handle<> saved_h(std::move(h)); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_accept_op(*this); } inline kqueue_acceptor::kqueue_acceptor(kqueue_acceptor_service& svc) noexcept - : svc_(svc) + : reactor_acceptor(svc) { } @@ -298,6 +153,21 @@ kqueue_acceptor::accept( return std::noop_coroutine(); } + // SO_NOSIGPIPE before budget check so both inline and + // queued paths have it applied (macOS lacks MSG_NOSIGNAL) + int one = 1; + if (::setsockopt( + accepted, SOL_SOCKET, SO_NOSIGPIPE, &one, + sizeof(one)) == -1) + { + int errn = errno; + ::close(accepted); + op.complete(errn, 0); + op.impl_ptr = shared_from_this(); + svc_.post(&op); + return std::noop_coroutine(); + } + { std::lock_guard lock(desc_state_.mutex); desc_state_.read_ready = false; @@ -322,49 +192,25 @@ kqueue_acceptor::accept( socket_svc->scheduler().register_descriptor( accepted, &impl.desc_state_); - // Suppress SIGPIPE 
on the accepted socket; macOS lacks MSG_NOSIGNAL - int one = 1; - if (::setsockopt( - accepted, SOL_SOCKET, SO_NOSIGPIPE, &one, - sizeof(one)) == -1) - { - int saved_errno = errno; - socket_svc->destroy(&impl); - if (ec) - *ec = make_err(saved_errno); - if (impl_out) - *impl_out = nullptr; - } - else - { - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - endpoint local_ep; - if (::getsockname( - accepted, - reinterpret_cast(&local_storage), - &local_len) == 0) - local_ep = from_sockaddr(local_storage); - impl.set_endpoints(local_ep, from_sockaddr(peer_storage)); - if (ec) - *ec = {}; - if (impl_out) - *impl_out = &impl; - } - return dispatch_coro(ex, h); + impl.set_endpoints( + local_endpoint_, from_sockaddr(peer_storage)); + + *ec = {}; + if (impl_out) + *impl_out = &impl; } else { ::close(accepted); - if (ec) - *ec = make_err(ENOENT); + *ec = make_err(ENOENT); if (impl_out) *impl_out = nullptr; - return dispatch_coro(ex, h); } + return dispatch_coro(ex, h); } - op.accepted_fd = accepted; + op.accepted_fd = accepted; + op.peer_storage = peer_storage; op.complete(0, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); @@ -373,60 +219,28 @@ kqueue_acceptor::accept( if (errno == EAGAIN || errno == EWOULDBLOCK) { - svc_.work_started(); op.impl_ptr = shared_from_this(); + svc_.work_started(); - bool perform_now = false; + std::lock_guard lock(desc_state_.mutex); + bool io_done = false; + if (desc_state_.read_ready) { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_ready) - { - desc_state_.read_ready = false; - perform_now = true; - } - else - { - desc_state_.read_op = &op; - } + desc_state_.read_ready = false; + op.perform_io(); + io_done = (op.errn != EAGAIN && op.errn != EWOULDBLOCK); + if (!io_done) + op.errn = 0; } - if (perform_now) + if (io_done || op.cancelled.load(std::memory_order_acquire)) { - for (;;) - { - op.perform_io(); - if (op.errn != EAGAIN && op.errn != EWOULDBLOCK) - { - svc_.post(&op); - 
svc_.work_finished(); - break; - } - op.errn = 0; - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_ready) - { - desc_state_.read_ready = false; - continue; - } - desc_state_.read_op = &op; - break; - } - return std::noop_coroutine(); + svc_.post(&op); + svc_.work_finished(); } - - if (op.cancelled.load(std::memory_order_acquire)) + else { - kqueue_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_op == &op) - claimed = std::exchange(desc_state_.read_op, nullptr); - } - if (claimed) - { - svc_.post(claimed); - svc_.work_finished(); - } + desc_state_.read_op = &op; } return std::noop_coroutine(); } @@ -440,86 +254,13 @@ kqueue_acceptor::accept( inline void kqueue_acceptor::cancel() noexcept { - auto self = weak_from_this().lock(); - if (!self) - return; - - acc_.request_cancel(); - - kqueue_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_op == &acc_) - claimed = std::exchange(desc_state_.read_op, nullptr); - } - if (claimed) - { - acc_.impl_ptr = self; - svc_.post(&acc_); - svc_.work_finished(); - } -} - -inline void -kqueue_acceptor::cancel_single_op(kqueue_op& op) noexcept -{ - auto self = weak_from_this().lock(); - if (!self) - return; - - op.request_cancel(); - - kqueue_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.read_op == &op) - claimed = std::exchange(desc_state_.read_op, nullptr); - } - if (claimed) - { - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } + do_cancel(); } inline void kqueue_acceptor::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - acc_.request_cancel(); - - kqueue_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - claimed = std::exchange(desc_state_.read_op, nullptr); - desc_state_.read_ready = false; - desc_state_.write_ready = false; - } - - if (claimed) - { - acc_.impl_ptr = self; - svc_.post(&acc_); - svc_.work_finished(); 
- } - - if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) - desc_state_.impl_ref_ = self; - } - - if (fd_ >= 0) - { - ::close(fd_); - fd_ = -1; - } - - desc_state_.fd = -1; - desc_state_.registered_events = 0; - - local_endpoint_ = endpoint{}; + do_close_socket(); } inline kqueue_acceptor_service::kqueue_acceptor_service( @@ -538,7 +279,7 @@ kqueue_acceptor_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->acceptor_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) impl->close_socket(); } @@ -549,8 +290,8 @@ kqueue_acceptor_service::construct() auto* raw = impl.get(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.push_back(raw); - state_->acceptor_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); return raw; } @@ -561,8 +302,8 @@ kqueue_acceptor_service::destroy(io_object::implementation* impl) auto* kq_impl = static_cast(impl); kq_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.remove(kq_impl); - state_->acceptor_ptrs_.erase(kq_impl); + state_->impl_list_.remove(kq_impl); + state_->impl_ptrs_.erase(kq_impl); } inline void @@ -571,27 +312,6 @@ kqueue_acceptor_service::close(io_object::handle& h) static_cast(h.get())->close_socket(); } -inline std::error_code -kqueue_acceptor::set_option( - int level, int optname, void const* data, std::size_t size) noexcept -{ - if (::setsockopt(fd_, level, optname, data, static_cast(size)) != - 0) - return make_err(errno); - return {}; -} - -inline std::error_code -kqueue_acceptor::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return make_err(errno); - *size = static_cast(len); - return {}; -} - inline std::error_code kqueue_acceptor_service::open_acceptor_socket( tcp_acceptor::implementation& 
impl, int family, int type, int protocol) @@ -652,41 +372,18 @@ inline std::error_code kqueue_acceptor_service::bind_acceptor( tcp_acceptor::implementation& impl, endpoint ep) { - auto* kq_impl = static_cast(&impl); - int fd = kq_impl->fd_; - - sockaddr_storage storage{}; - socklen_t addrlen = detail::to_sockaddr(ep, storage); - if (::bind(fd, reinterpret_cast(&storage), addrlen) < 0) - return make_err(errno); - - // Cache local endpoint (resolves ephemeral port) - sockaddr_storage local{}; - socklen_t local_len = sizeof(local); - if (::getsockname(fd, reinterpret_cast(&local), &local_len) == 0) - kq_impl->set_local_endpoint(detail::from_sockaddr(local)); - - return {}; + return static_cast(&impl)->do_bind(ep); } inline std::error_code kqueue_acceptor_service::listen_acceptor( tcp_acceptor::implementation& impl, int backlog) { - auto* kq_impl = static_cast(&impl); - int fd = kq_impl->fd_; - - if (::listen(fd, backlog) < 0) - return make_err(errno); - - // Register fd with kqueue - scheduler().register_descriptor(fd, &kq_impl->desc_state_); - - return {}; + return static_cast(&impl)->do_listen(backlog); } inline void -kqueue_acceptor_service::post(kqueue_op* op) +kqueue_acceptor_service::post(scheduler_op* op) { state_->sched_.post(op); } diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_op.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_op.hpp index 6245a92f..f34b6a5e 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_op.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_op.hpp @@ -15,30 +15,11 @@ #if BOOST_COROSIO_HAS_KQUEUE -#include -#include -#include -#include -#include -#include -#include +#include +#include -#include - -#include -#include #include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include /* kqueue Operation State @@ -79,13 +60,11 @@ namespace boost::corosio::detail { -// Ready-event flag constants for descriptor_state::ready_events_. 
-// These match the epoll numeric values (EPOLLIN=0x1, EPOLLOUT=0x4, -// EPOLLERR=0x8) so that descriptor_state::operator()() uses the same -// flag-checking logic as the epoll backend. -static constexpr std::uint32_t kqueue_event_read = 0x001; -static constexpr std::uint32_t kqueue_event_write = 0x004; -static constexpr std::uint32_t kqueue_event_error = 0x008; +// Aliases for shared reactor event constants. +// Kept for backward compatibility in kqueue-specific code. +static constexpr std::uint32_t kqueue_event_read = reactor_event_read; +static constexpr std::uint32_t kqueue_event_write = reactor_event_write; +static constexpr std::uint32_t kqueue_event_error = reactor_event_error; // Forward declarations class kqueue_socket; @@ -94,326 +73,111 @@ struct kqueue_op; class kqueue_scheduler; -/** Per-descriptor state for persistent kqueue registration. - - Tracks pending operations for a file descriptor. The fd is registered - once with kqueue (EVFILT_READ + EVFILT_WRITE, both EV_CLEAR) and stays - registered until closed. +/// Per-descriptor state for persistent kqueue registration. +struct descriptor_state final : reactor_descriptor_state +{}; - This struct extends scheduler_op to support deferred I/O processing. - When kqueue events arrive, the reactor sets ready_events and queues - this descriptor for processing. When popped from the scheduler queue, - operator() performs the actual I/O and queues completion handlers. - - @par Deferred I/O Model - The reactor no longer performs I/O directly. Instead: - 1. Reactor sets ready_events and queues descriptor_state - 2. Scheduler pops descriptor_state and calls operator() - 3. operator() performs I/O under mutex and queues completions - - This eliminates per-descriptor mutex locking from the reactor hot path. - - @par Thread Safety - The mutex protects operation pointers and ready flags during I/O. - ready_events_ and is_enqueued_ are atomic for lock-free reactor access. 
-*/ -struct descriptor_state final : scheduler_op +/// kqueue base operation — thin wrapper over reactor_op. +struct kqueue_op : reactor_op { - std::mutex mutex; - - // Protected by mutex - kqueue_op* read_op = nullptr; - kqueue_op* write_op = nullptr; - kqueue_op* connect_op = nullptr; - - // Caches edge events that arrived before an op was registered - bool read_ready = false; - bool write_ready = false; - - // Deferred cancellation: set by cancel() when the target op is not - // parked (e.g. completing inline via speculative I/O). Checked when - // the next op parks; if set, the op is immediately self-cancelled. - // This matches IOCP semantics where CancelIoEx always succeeds. - bool read_cancel_pending = false; - bool write_cancel_pending = false; - bool connect_cancel_pending = false; - - // Set during registration only (no mutex needed) - std::uint32_t registered_events = 0; - int fd = -1; - - // For deferred I/O - set by reactor, read by scheduler - std::atomic ready_events_{0}; - std::atomic is_enqueued_{false}; - kqueue_scheduler const* scheduler_ = nullptr; - - // Prevents impl destruction while this descriptor_state is queued. - // Set by close_socket() when is_enqueued_ is true, cleared by operator(). - std::shared_ptr impl_ref_; - - /// Add ready events atomically. - /// Release pairs with the consumer's acquire exchange on - /// ready_events_ so the consumer sees all flags. On x86 (TSO) - /// this compiles to the same LOCK OR as relaxed. - void add_ready_events(std::uint32_t ev) noexcept - { - ready_events_.fetch_or(ev, std::memory_order_release); - } - - /// Perform deferred I/O and queue completions. void operator()() override; - - /// Destroy without invoking. - /// Called during scheduler::shutdown() drain. Clear impl_ref_ to break - /// the self-referential cycle set by close_socket(). - void destroy() override - { - impl_ref_.reset(); - } }; -struct kqueue_op : scheduler_op +/// kqueue connect operation. 
+struct kqueue_connect_op final : reactor_connect_op { - struct canceller - { - kqueue_op* op; - void operator()() const noexcept; - }; - - std::coroutine_handle<> h; - capy::executor_ref ex; - std::error_code* ec_out = nullptr; - std::size_t* bytes_out = nullptr; - - int fd = -1; - int errn = 0; - std::size_t bytes_transferred = 0; - - std::atomic cancelled{false}; - std::optional> stop_cb; - - // Prevents use-after-free when socket is closed with pending ops. - // See "Impl Lifetime Management" in file header. - std::shared_ptr impl_ptr; - - // For stop_token cancellation - pointer to owning socket/acceptor impl. - // When stop is requested, we call back to the impl to perform actual I/O cancellation. - kqueue_socket* socket_impl_ = nullptr; - kqueue_acceptor* acceptor_impl_ = nullptr; - - kqueue_op() = default; - - void reset() noexcept - { - fd = -1; - errn = 0; - bytes_transferred = 0; - cancelled.store(false, std::memory_order_relaxed); - impl_ptr.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = nullptr; - } - - // Defined in sockets.cpp where kqueue_socket is complete void operator()() override; - - virtual bool is_read_operation() const noexcept - { - return false; - } - virtual void cancel() noexcept = 0; - - void destroy() override - { - stop_cb.reset(); - impl_ptr.reset(); - } - - void request_cancel() noexcept - { - cancelled.store(true, std::memory_order_release); - } - - void start(std::stop_token token, kqueue_socket* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = impl; - acceptor_impl_ = nullptr; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void start(std::stop_token token, kqueue_acceptor* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = impl; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void complete(int err, std::size_t bytes) noexcept - { - 
errn = err; - bytes_transferred = bytes; - } - - virtual void perform_io() noexcept {} + void cancel() noexcept override; }; -struct kqueue_connect_op final : kqueue_op +/// kqueue scatter-read operation. +struct kqueue_read_op final : reactor_read_op { - endpoint target_endpoint; - - void reset() noexcept - { - kqueue_op::reset(); - target_endpoint = endpoint{}; - } - - void perform_io() noexcept override - { - // connect() completion status is retrieved via SO_ERROR, not return value - int err = 0; - socklen_t len = sizeof(err); - if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) - err = errno; - complete(err, 0); - } - - // Defined in sockets.cpp where kqueue_socket is complete - void operator()() override; void cancel() noexcept override; }; -struct kqueue_read_op final : kqueue_op -{ - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - bool empty_buffer_read = false; - - bool is_read_operation() const noexcept override - { - return !empty_buffer_read; - } +/** Provides writev() for kqueue writes. - void reset() noexcept - { - kqueue_op::reset(); - iovec_count = 0; - empty_buffer_read = false; - } - - void perform_io() noexcept override + SO_NOSIGPIPE is set on the socket at creation time (macOS lacks + MSG_NOSIGNAL), so writev() is safe from SIGPIPE. +*/ +struct kqueue_write_policy +{ + static ssize_t write(int fd, iovec* iovecs, int count) noexcept { - ssize_t n = ::readv(fd, iovecs, iovec_count); - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); + ssize_t n; + do + { + n = ::writev(fd, iovecs, count); + } + while (n < 0 && errno == EINTR); + return n; } - - void cancel() noexcept override; }; -struct kqueue_write_op final : kqueue_op +/// kqueue gather-write operation. 
+struct kqueue_write_op final : reactor_write_op { - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - - void reset() noexcept - { - kqueue_op::reset(); - iovec_count = 0; - } - - void perform_io() noexcept override - { - // SO_NOSIGPIPE is set on the socket at creation time (see sockets.cpp), - // so writev() is safe from SIGPIPE. - // FreeBSD: Supports MSG_NOSIGNAL on sendmsg() - ssize_t n = ::writev(fd, iovecs, iovec_count); - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); - } - void cancel() noexcept override; }; -struct kqueue_accept_op final : kqueue_op -{ - int accepted_fd = -1; - io_object::implementation* peer_impl = nullptr; - io_object::implementation** impl_out = nullptr; +/** Provides accept() + fcntl() + SO_NOSIGPIPE for kqueue accepts. - void reset() noexcept + Unlike Linux's accept4(), BSD accept() does not support atomic + flag setting. Non-blocking, close-on-exec, and SIGPIPE suppression + are applied via separate syscalls after accept(). 
+*/ +struct kqueue_accept_policy +{ + static int do_accept(int fd, sockaddr_storage& peer) noexcept { - kqueue_op::reset(); - accepted_fd = -1; - peer_impl = nullptr; - impl_out = nullptr; - } + int new_fd; + do + { + socklen_t addrlen = sizeof(peer); + new_fd = ::accept(fd, reinterpret_cast(&peer), &addrlen); + } + while (new_fd < 0 && errno == EINTR); - void perform_io() noexcept override - { - sockaddr_storage addr_storage{}; - socklen_t addrlen = sizeof(addr_storage); + if (new_fd < 0) + return new_fd; - // FreeBSD: Can use accept4(fd, addr, len, SOCK_NONBLOCK | SOCK_CLOEXEC) - int new_fd = - ::accept(fd, reinterpret_cast(&addr_storage), &addrlen); + int flags = ::fcntl(new_fd, F_GETFL, 0); + if (flags == -1 || ::fcntl(new_fd, F_SETFL, flags | O_NONBLOCK) == -1) + { + int err = errno; + ::close(new_fd); + errno = err; + return -1; + } - if (new_fd >= 0) + if (::fcntl(new_fd, F_SETFD, FD_CLOEXEC) == -1) { - // Set non-blocking - int flags = ::fcntl(new_fd, F_GETFL, 0); - if (flags == -1 || - ::fcntl(new_fd, F_SETFL, flags | O_NONBLOCK) == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - // Set close-on-exec - if (::fcntl(new_fd, F_SETFD, FD_CLOEXEC) == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - // Suppress SIGPIPE on accepted sockets; macOS lacks MSG_NOSIGNAL - int one = 1; - if (::setsockopt( - new_fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - accepted_fd = new_fd; - complete(0, 0); + int err = errno; + ::close(new_fd); + errno = err; + return -1; } - else + + // macOS lacks MSG_NOSIGNAL + int one = 1; + if (::setsockopt(new_fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == + -1) { - complete(errno, 0); + int err = errno; + ::close(new_fd); + errno = err; + return -1; } + + return new_fd; } +}; - // Defined in acceptors.cpp where kqueue_acceptor is complete +/// kqueue accept operation. 
+struct kqueue_accept_op final + : reactor_accept_op +{ void operator()() override; void cancel() noexcept override; }; diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_scheduler.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_scheduler.hpp index 4da95cc8..f829aeea 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_scheduler.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_scheduler.hpp @@ -18,75 +18,32 @@ #include #include -#include -#include +#include + #include #include #include #include #include + #include -#include #include #include -#include -#include #include #include #include -#include #include #include #include -#include #include #include -/* - kqueue Scheduler - Single Reactor Model - ======================================== - - This scheduler uses the same thread coordination strategy as the epoll - backend to provide handler parallelism and avoid the thundering herd problem. - Instead of all threads blocking on kevent(), one thread becomes the - "reactor" while others wait on a condition variable for handler work. - - Thread Model - ------------ - - ONE thread runs kevent() at a time (the reactor thread) - - OTHER threads wait on cond_ (condition variable) for handlers - - When work is posted, exactly one waiting thread wakes via notify_one() - - This matches Windows IOCP semantics where N posted items wake N threads - - Event Loop Structure (do_one) - ----------------------------- - 1. Lock mutex, try to pop handler from queue - 2. If got handler: execute it (unlocked), return - 3. If queue empty and no reactor running: become reactor - - Run kevent() (unlocked), queue I/O completions, loop back - 4. 
If queue empty and reactor running: wait on condvar for work - - kqueue-Specific Design - ---------------------- - - Uses EVFILT_USER for reactor interruption (no extra fd needed) - - Uses EV_CLEAR for edge-triggered semantics (equivalent to EPOLLET) - - Timer expiry computed from timer_service, passed as kevent() timeout - - No timerfd equivalent; uses software timer queue - - Signaling State (state_) - ------------------------ - Same as epoll: bit 0 = signaled, upper bits = waiter count. -*/ - namespace boost::corosio::detail { struct kqueue_op; struct descriptor_state; -namespace kqueue { -struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context; -} // namespace kqueue /** macOS/BSD scheduler using kqueue for I/O multiplexing. @@ -111,13 +68,9 @@ struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context; @par Thread Safety All public member functions are thread-safe. */ -class BOOST_COROSIO_DECL kqueue_scheduler final - : public native_scheduler - , public capy::execution_context::service +class BOOST_COROSIO_DECL kqueue_scheduler final : public reactor_scheduler_base { public: - using key_type = scheduler; - /** Construct the scheduler. Creates a kqueue file descriptor via kqueue(), sets @@ -144,18 +97,8 @@ class BOOST_COROSIO_DECL kqueue_scheduler final kqueue_scheduler(kqueue_scheduler const&) = delete; kqueue_scheduler& operator=(kqueue_scheduler const&) = delete; + /// Shut down the scheduler, draining pending operations. void shutdown() override; - void post(std::coroutine_handle<> h) const override; - void post(scheduler_op* h) const override; - bool running_in_this_thread() const noexcept override; - void stop() override; - bool stopped() const noexcept override; - void restart() override; - std::size_t run() override; - std::size_t run_one() override; - std::size_t wait_one(long usec) override; - std::size_t poll() override; - std::size_t poll_one() override; /** Return the kqueue file descriptor. 
@@ -169,43 +112,12 @@ class BOOST_COROSIO_DECL kqueue_scheduler final return kq_fd_; } - /** Reset the thread's inline completion budget. - - Called at the start of each posted completion handler to - grant a fresh budget for speculative inline completions. - Operates in two modes depending on whether another thread - absorbed queued work from the previous dispatch cycle: - - - **Adaptive** (default): the effective cap ramps up when - the previous cycle fully consumed its budget (doubles up - to 16) and ramps down to the floor (2) when budget was - only partially consumed, tracking actual inline demand. - - **Unassisted**: entered when no other thread was available - to signal (unlock_and_signal_one returned false). Applies - a fixed conservative cap (4) to amortize scheduling - overhead for small buffers while avoiding bursty I/O that - fills socket buffers and stalls large transfers. - */ - void reset_inline_budget() const noexcept; - - /** Consume one unit of inline budget if available. - - @return True if budget was available and consumed. - */ - bool try_consume_inline_budget() const noexcept; - /** Register a descriptor for persistent monitoring. Adds EVFILT_READ and EVFILT_WRITE (both EV_CLEAR) for @a fd and stores @a desc in the kevent udata field so that the reactor can dispatch events to the correct descriptor_state. - The caller retains ownership of @a desc. It must remain valid - until deregister_descriptor() is called and all pending - read/write/connect operations referencing it have completed. - The scheduler accesses @a desc asynchronously from the reactor - thread when kevent delivers events. - @param fd The file descriptor to register. @param desc Pointer to the caller-owned descriptor_state. @@ -219,500 +131,25 @@ class BOOST_COROSIO_DECL kqueue_scheduler final Errors are silently ignored because the fd may already be closed and kqueue automatically removes closed descriptors. 
- After this call returns, the reactor will not deliver any - further events for @a fd, so the associated descriptor_state - may be safely destroyed once all previously queued completions - have been processed. - @param fd The file descriptor to deregister. */ void deregister_descriptor(int fd) const; - void work_started() noexcept override; - void work_finished() noexcept override; - - /** Offset a forthcoming work_finished from work_cleanup. - - Called by descriptor_state when all I/O returned EAGAIN and no - handler will be executed. Must be called from a scheduler thread. - */ - void compensating_work_started() const noexcept; - - /** Drain work from thread context's private queue to global queue. - - Called by thread_context_guard destructor when a thread exits run(). - Transfers pending work to the global queue under mutex protection. - - @param queue The private queue to drain. - @param count Item count for wakeup decisions (wakes other threads if positive). - */ - void drain_thread_queue(op_queue& queue, std::int64_t count) const; - - /** Post completed operations for deferred invocation. - - If called from a thread running this scheduler, operations go to - the thread's private queue (fast path). Otherwise, operations are - added to the global queue under mutex and a waiter is signaled. - - @par Preconditions - work_started() must have been called for each operation. - - @param ops Queue of operations to post. 
- */ - void post_deferred_completions(op_queue& ops) const; - private: - struct work_cleanup - { - kqueue_scheduler* scheduler; - std::unique_lock* lock; - kqueue::scheduler_context* ctx; - ~work_cleanup(); - }; - - struct task_cleanup - { - kqueue_scheduler const* scheduler; - kqueue::scheduler_context* ctx; - ~task_cleanup(); - }; - - std::size_t do_one( - std::unique_lock& lock, - long timeout_us, - kqueue::scheduler_context* ctx); - void run_task( - std::unique_lock& lock, kqueue::scheduler_context* ctx); - void wake_one_thread_and_unlock(std::unique_lock& lock) const; - void interrupt_reactor() const; + void + run_task(std::unique_lock& lock, context_type* ctx) override; + void interrupt_reactor() const override; long calculate_timeout(long requested_timeout_us) const; - /** Set the signaled state and wake all waiting threads. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - */ - void signal_all(std::unique_lock& lock) const; - - /** Set the signaled state and wake one waiter if any exist. - - Only unlocks and signals if at least one thread is waiting. - Use this when the caller needs to perform a fallback action - (such as interrupting the reactor) when no waiters exist. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - - @return `true` if unlocked and signaled, `false` if lock still held. - */ - bool maybe_unlock_and_signal_one(std::unique_lock& lock) const; - - /** Set the signaled state, unlock, and wake one waiter if any exist. - - Always unlocks the mutex. Use this when the caller will release - the lock regardless of whether a waiter exists. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - - @return `true` if at least one waiter was signaled, - `false` if no waiters existed. - */ - bool unlock_and_signal_one(std::unique_lock& lock) const; - - /** Clear the signaled state before waiting. - - @par Preconditions - Mutex must be held. 
- */ - void clear_signal() const; - - /** Block until the signaled state is set. - - Returns immediately if already signaled (fast-path). Otherwise - increments the waiter count, waits on the condition variable, - and decrements the waiter count upon waking. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - */ - void wait_for_signal(std::unique_lock& lock) const; - - /** Block until signaled or timeout expires. - - @par Preconditions - Mutex must be held. - - @param lock The held mutex lock. - @param timeout_us Maximum time to wait in microseconds. - */ - void wait_for_signal_for( - std::unique_lock& lock, long timeout_us) const; - int kq_fd_; - mutable std::mutex mutex_; - mutable std::condition_variable cond_; - mutable op_queue completed_ops_; - mutable std::atomic outstanding_work_{0}; - std::atomic stopped_{false}; - - // True while a thread is blocked in kevent(). Used by - // wake_one_thread_and_unlock and work_finished to know when - // an EVFILT_USER interrupt is needed instead of a condvar signal. - mutable bool task_running_ = false; - - // True when the reactor has been told to do a non-blocking poll - // (more handlers queued or poll mode). Prevents redundant EVFILT_USER - // triggers and controls the kevent() timeout. - mutable bool task_interrupted_ = false; - - // Signaling state: bit 0 = signaled, upper bits = waiter count - static constexpr std::size_t signaled_bit = 1; - static constexpr std::size_t waiter_increment = 2; - mutable std::size_t state_ = 0; - - // EVFILT_USER idempotency: prevents redundant NOTE_TRIGGER writes - mutable std::atomic user_event_armed_{false}; - - // Sentinel operation for interleaving reactor runs with handler execution. - // Ensures the reactor runs periodically even when handlers are continuously - // posted, preventing starvation of I/O events, timers, and signals. 
- struct task_op final : scheduler_op - { - void operator()() override {} - void destroy() override {} - }; - task_op task_op_; -}; - -// -- Implementation --------------------------------------------------------- - -namespace kqueue { - -struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context -{ - kqueue_scheduler const* key; - scheduler_context* next; - op_queue private_queue; - std::int64_t private_outstanding_work; - int inline_budget; - int inline_budget_max; - bool unassisted; - - scheduler_context(kqueue_scheduler const* k, scheduler_context* n) - : key(k) - , next(n) - , private_outstanding_work(0) - , inline_budget(0) - , inline_budget_max(2) - , unassisted(false) - { - } -}; - -inline thread_local_ptr context_stack; -struct thread_context_guard -{ - scheduler_context frame_; - - explicit thread_context_guard(kqueue_scheduler const* ctx) noexcept - : frame_(ctx, context_stack.get()) - { - context_stack.set(&frame_); - } - - ~thread_context_guard() noexcept - { - if (!frame_.private_queue.empty()) - frame_.key->drain_thread_queue( - frame_.private_queue, frame_.private_outstanding_work); - context_stack.set(frame_.next); - } + // EVFILT_USER idempotency + mutable std::atomic user_event_armed_{false}; }; -inline scheduler_context* -find_context(kqueue_scheduler const* self) noexcept -{ - for (auto* c = context_stack.get(); c != nullptr; c = c->next) - if (c->key == self) - return c; - return nullptr; -} - -/// Flush private work count to global counter. -inline void -flush_private_work( - scheduler_context* ctx, - std::atomic& outstanding_work) noexcept -{ - if (ctx && ctx->private_outstanding_work > 0) - { - outstanding_work.fetch_add( - ctx->private_outstanding_work, std::memory_order_relaxed); - ctx->private_outstanding_work = 0; - } -} - -/// Drain private queue to global queue, flushing work count first. -/// -/// @return True if any ops were drained. 
-inline bool -drain_private_queue( - scheduler_context* ctx, - std::atomic& outstanding_work, - op_queue& completed_ops) noexcept -{ - if (!ctx || ctx->private_queue.empty()) - return false; - - flush_private_work(ctx, outstanding_work); - completed_ops.splice(ctx->private_queue); - return true; -} - -} // namespace kqueue - -inline void -kqueue_scheduler::reset_inline_budget() const noexcept -{ - if (auto* ctx = kqueue::find_context(this)) - { - // Cap when no other thread absorbed queued work. A moderate - // cap (4) amortizes scheduling for small buffers while avoiding - // bursty I/O that fills socket buffers and stalls large transfers. - if (ctx->unassisted) - { - ctx->inline_budget_max = 4; - ctx->inline_budget = 4; - return; - } - // Ramp up when previous cycle fully consumed budget. - // Reset on partial consumption (EAGAIN hit or peer got scheduled). - if (ctx->inline_budget == 0) - ctx->inline_budget_max = (std::min)(ctx->inline_budget_max * 2, 16); - else if (ctx->inline_budget < ctx->inline_budget_max) - ctx->inline_budget_max = 2; - ctx->inline_budget = ctx->inline_budget_max; - } -} - -inline bool -kqueue_scheduler::try_consume_inline_budget() const noexcept -{ - if (auto* ctx = kqueue::find_context(this)) - { - if (ctx->inline_budget > 0) - { - --ctx->inline_budget; - return true; - } - } - return false; -} - -inline void -descriptor_state::operator()() -{ - // Release ensures the false is visible to the reactor's CAS on other - // cores. With relaxed, ARM's store buffer can delay the write, - // causing the reactor's CAS to see a stale 'true' and skip - // enqueue—permanently losing the edge-triggered event and - // eventually deadlocking. On x86 (TSO) release compiles to the - // same MOV as relaxed, so there is no cost there. 
- is_enqueued_.store(false, std::memory_order_release); - - // Take ownership of impl ref set by close_socket() to prevent - // the owning impl from being freed while we're executing - auto prevent_impl_destruction = std::move(impl_ref_); - - std::uint32_t ev = ready_events_.exchange(0, std::memory_order_acquire); - if (ev == 0) - { - scheduler_->compensating_work_started(); - return; - } - - op_queue local_ops; - - int err = 0; - if (ev & kqueue_event_error) - { - socklen_t len = sizeof(err); - if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) - err = errno; - if (err == 0) - err = EIO; - } - - kqueue_op* rd = nullptr; - kqueue_op* wr = nullptr; - kqueue_op* cn = nullptr; - { - std::lock_guard lock(mutex); - if (ev & kqueue_event_read) - { - rd = std::exchange(read_op, nullptr); - if (!rd) - read_ready = true; - } - if (ev & kqueue_event_write) - { - cn = std::exchange(connect_op, nullptr); - wr = std::exchange(write_op, nullptr); - if (!cn && !wr) - write_ready = true; - } - if (err && !(ev & (kqueue_event_read | kqueue_event_write))) - { - rd = std::exchange(read_op, nullptr); - wr = std::exchange(write_op, nullptr); - cn = std::exchange(connect_op, nullptr); - } - } - - // Non-null after I/O means EAGAIN; re-register under lock below - if (rd) - { - if (err) - rd->complete(err, 0); - else - rd->perform_io(); - - if (rd->errn == EAGAIN || rd->errn == EWOULDBLOCK) - { - rd->errn = 0; - } - else - { - local_ops.push(rd); - rd = nullptr; - } - } - - if (cn) - { - if (err) - cn->complete(err, 0); - else - cn->perform_io(); - local_ops.push(cn); - cn = nullptr; - } - - if (wr) - { - if (err) - wr->complete(err, 0); - else - wr->perform_io(); - - if (wr->errn == EAGAIN || wr->errn == EWOULDBLOCK) - { - wr->errn = 0; - } - else - { - local_ops.push(wr); - wr = nullptr; - } - } - - // Re-register EAGAIN ops. 
A concurrent operator()() invocation may - // have set read_ready/write_ready while we held the op (no read_op - // was registered, so it cached the edge event). Check the flags - // under the same lock as re-registration so no edge is lost. - while (rd || wr) - { - bool retry = false; - { - std::lock_guard lock(mutex); - if (rd) - { - if (read_ready) - { - read_ready = false; - retry = true; - } - else - { - read_op = rd; - rd = nullptr; - } - } - if (wr) - { - if (write_ready) - { - write_ready = false; - retry = true; - } - else - { - write_op = wr; - wr = nullptr; - } - } - } - - if (!retry) - break; - - if (rd) - { - rd->perform_io(); - if (rd->errn == EAGAIN || rd->errn == EWOULDBLOCK) - rd->errn = 0; - else - { - local_ops.push(rd); - rd = nullptr; - } - } - if (wr) - { - wr->perform_io(); - if (wr->errn == EAGAIN || wr->errn == EWOULDBLOCK) - wr->errn = 0; - else - { - local_ops.push(wr); - wr = nullptr; - } - } - } - - // Execute first handler inline — the scheduler's work_cleanup - // accounts for this as the "consumed" work item - scheduler_op* first = local_ops.pop(); - if (first) - { - scheduler_->post_deferred_completions(local_ops); - (*first)(); - } - else - { - scheduler_->compensating_work_started(); - } -} - inline kqueue_scheduler::kqueue_scheduler(capy::execution_context& ctx, int) : kq_fd_(-1) - , outstanding_work_(0) - , stopped_(false) - , task_running_(false) - , task_interrupted_(false) - , state_(0) { - // FreeBSD 13+: kqueue1(O_CLOEXEC) available kq_fd_ = ::kqueue(); if (kq_fd_ < 0) detail::throw_system_error(make_err(errno), "kqueue"); @@ -724,8 +161,6 @@ inline kqueue_scheduler::kqueue_scheduler(capy::execution_context& ctx, int) detail::throw_system_error(make_err(errn), "fcntl (kqueue FD_CLOEXEC)"); } - // Register EVFILT_USER for reactor interruption (no self-pipe fallback). - // Requires FreeBSD 11+ or macOS 10.6+; fails with throw on older kernels. 
struct kevent ev; EV_SET(&ev, 0, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, nullptr); if (::kevent(kq_fd_, &ev, 1, nullptr, 0, nullptr) < 0) @@ -741,13 +176,9 @@ inline kqueue_scheduler::kqueue_scheduler(capy::execution_context& ctx, int) static_cast(p)->interrupt_reactor(); })); - // Initialize resolver service get_resolver_service(ctx, *this); - - // Initialize signal service get_signal_service(ctx, *this); - // Push task sentinel to interleave reactor runs with handler execution completed_ops_.push(&task_op_); } @@ -760,220 +191,12 @@ inline kqueue_scheduler::~kqueue_scheduler() inline void kqueue_scheduler::shutdown() { - { - std::unique_lock lock(mutex_); - - while (auto* h = completed_ops_.pop()) - { - if (h == &task_op_) - continue; - lock.unlock(); - h->destroy(); - lock.lock(); - } - - signal_all(lock); - } + shutdown_drain(); if (kq_fd_ >= 0) interrupt_reactor(); } -inline void -kqueue_scheduler::post(std::coroutine_handle<> h) const -{ - struct post_handler final : scheduler_op - { - std::coroutine_handle<> h_; - - explicit post_handler(std::coroutine_handle<> h) : h_(h) {} - - ~post_handler() = default; - - void operator()() override - { - auto h = h_; - delete this; - // Acquire fence on *this thread* (not the deleted object) ensures - // stores made by the posting thread (e.g. coroutine state written - // before the cross-thread post) are visible before we resume. 
- std::atomic_thread_fence(std::memory_order_acquire); - h.resume(); - } - - void destroy() override - { - auto h = h_; - delete this; - h.destroy(); - } - }; - - auto ph = std::make_unique(h); - - // Fast path: same thread posts to private queue - // Only count locally; work_cleanup batches to global counter - if (auto* ctx = kqueue::find_context(this)) - { - ++ctx->private_outstanding_work; - ctx->private_queue.push(ph.release()); - return; - } - - // Slow path: cross-thread post requires mutex - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(ph.release()); - wake_one_thread_and_unlock(lock); -} - -inline void -kqueue_scheduler::post(scheduler_op* h) const -{ - // Fast path: same thread posts to private queue - // Only count locally; work_cleanup batches to global counter - if (auto* ctx = kqueue::find_context(this)) - { - ++ctx->private_outstanding_work; - ctx->private_queue.push(h); - return; - } - - // Slow path: cross-thread post requires mutex - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(h); - wake_one_thread_and_unlock(lock); -} - -inline bool -kqueue_scheduler::running_in_this_thread() const noexcept -{ - for (auto* c = kqueue::context_stack.get(); c != nullptr; c = c->next) - if (c->key == this) - return true; - return false; -} - -inline void -kqueue_scheduler::stop() -{ - std::unique_lock lock(mutex_); - if (!stopped_.load(std::memory_order_relaxed)) - { - stopped_.store(true, std::memory_order_release); - signal_all(lock); - interrupt_reactor(); - } -} - -inline bool -kqueue_scheduler::stopped() const noexcept -{ - return stopped_.load(std::memory_order_acquire); -} - -inline void -kqueue_scheduler::restart() -{ - std::unique_lock lock(mutex_); - stopped_.store(false, std::memory_order_release); -} - -inline std::size_t -kqueue_scheduler::run() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { 
- stop(); - return 0; - } - - kqueue::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - - std::size_t n = 0; - for (;;) - { - if (!do_one(lock, -1, &ctx.frame_)) - break; - if (n != (std::numeric_limits::max)()) - ++n; - if (!lock.owns_lock()) - lock.lock(); - } - return n; -} - -inline std::size_t -kqueue_scheduler::run_one() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - kqueue::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, -1, &ctx.frame_); -} - -inline std::size_t -kqueue_scheduler::wait_one(long usec) -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - kqueue::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, usec, &ctx.frame_); -} - -inline std::size_t -kqueue_scheduler::poll() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - kqueue::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - - std::size_t n = 0; - for (;;) - { - if (!do_one(lock, 0, &ctx.frame_)) - break; - if (n != (std::numeric_limits::max)()) - ++n; - if (!lock.owns_lock()) - lock.lock(); - } - return n; -} - -inline std::size_t -kqueue_scheduler::poll_one() -{ - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - kqueue::thread_context_guard ctx(this); - std::unique_lock lock(mutex_); - return do_one(lock, 0, &ctx.frame_); -} - inline void kqueue_scheduler::register_descriptor(int fd, descriptor_state* desc) const { @@ -991,8 +214,10 @@ kqueue_scheduler::register_descriptor(int fd, descriptor_state* desc) const desc->registered_events = kqueue_event_read | kqueue_event_write; desc->fd = fd; desc->scheduler_ = this; + desc->ready_events_.store(0, std::memory_order_relaxed); std::lock_guard lock(desc->mutex); + desc->impl_ref_.reset(); desc->read_ready = false; desc->write_ready = false; } @@ -1007,72 
+232,12 @@ kqueue_scheduler::deregister_descriptor(int fd) const EV_SET( &changes[1], static_cast(fd), EVFILT_WRITE, EV_DELETE, 0, 0, nullptr); - // Ignore errors - fd may already be closed (kqueue auto-removes on close) ::kevent(kq_fd_, changes, 2, nullptr, 0, nullptr); } -inline void -kqueue_scheduler::work_started() noexcept -{ - outstanding_work_.fetch_add(1, std::memory_order_relaxed); -} - -inline void -kqueue_scheduler::work_finished() noexcept -{ - if (outstanding_work_.fetch_sub(1, std::memory_order_acq_rel) == 1) - stop(); -} - -inline void -kqueue_scheduler::compensating_work_started() const noexcept -{ - auto* ctx = kqueue::find_context(this); - if (ctx) - ++ctx->private_outstanding_work; -} - -inline void -kqueue_scheduler::drain_thread_queue(op_queue& queue, std::int64_t count) const -{ - // Flush private work count to global counter — private posts - // only incremented the thread-local counter, not outstanding_work_ - if (count > 0) - outstanding_work_.fetch_add(count, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.splice(queue); - if (count > 0) - maybe_unlock_and_signal_one(lock); -} - -inline void -kqueue_scheduler::post_deferred_completions(op_queue& ops) const -{ - if (ops.empty()) - return; - - // Fast path: if on scheduler thread, use private queue - if (auto* ctx = kqueue::find_context(this)) - { - ctx->private_queue.splice(ops); - return; - } - - // Slow path: add to global queue and wake a thread - std::unique_lock lock(mutex_); - completed_ops_.splice(ops); - wake_one_thread_and_unlock(lock); -} - inline void kqueue_scheduler::interrupt_reactor() const { - // Only trigger if not already armed to avoid redundant triggers. - // acq_rel: release makes the true store visible to the reactor; - // acquire on failure sees the reactor's release store of false, - // preventing a stale-true read that would silently drop the trigger. - // On x86 (TSO) this compiles to the same LOCK CMPXCHG as before. 
bool expected = false; if (user_event_armed_.compare_exchange_strong( expected, true, std::memory_order_acq_rel, @@ -1084,87 +249,6 @@ kqueue_scheduler::interrupt_reactor() const } } -inline void -kqueue_scheduler::signal_all(std::unique_lock&) const -{ - state_ |= signaled_bit; - cond_.notify_all(); -} - -inline bool -kqueue_scheduler::maybe_unlock_and_signal_one( - std::unique_lock& lock) const -{ - state_ |= signaled_bit; - if (state_ > signaled_bit) - { - lock.unlock(); - cond_.notify_one(); - return true; - } - return false; -} - -inline bool -kqueue_scheduler::unlock_and_signal_one( - std::unique_lock& lock) const -{ - state_ |= signaled_bit; - bool have_waiters = state_ > signaled_bit; - lock.unlock(); - if (have_waiters) - cond_.notify_one(); - return have_waiters; -} - -inline void -kqueue_scheduler::clear_signal() const -{ - state_ &= ~signaled_bit; -} - -inline void -kqueue_scheduler::wait_for_signal(std::unique_lock& lock) const -{ - while ((state_ & signaled_bit) == 0) - { - state_ += waiter_increment; - cond_.wait(lock); - state_ -= waiter_increment; - } -} - -inline void -kqueue_scheduler::wait_for_signal_for( - std::unique_lock& lock, long timeout_us) const -{ - if ((state_ & signaled_bit) == 0) - { - state_ += waiter_increment; - cond_.wait_for(lock, std::chrono::microseconds(timeout_us)); - state_ -= waiter_increment; - } -} - -inline void -kqueue_scheduler::wake_one_thread_and_unlock( - std::unique_lock& lock) const -{ - if (maybe_unlock_and_signal_one(lock)) - return; - - if (task_running_ && !task_interrupted_) - { - task_interrupted_ = true; - lock.unlock(); - interrupt_reactor(); - } - else - { - lock.unlock(); - } -} - inline long kqueue_scheduler::calculate_timeout(long requested_timeout_us) const { @@ -1183,7 +267,6 @@ kqueue_scheduler::calculate_timeout(long requested_timeout_us) const std::chrono::duration_cast(nearest - now) .count(); - // Clamp to [0, LONG_MAX] to prevent truncation on 32-bit long platforms constexpr auto long_max = 
static_cast((std::numeric_limits::max)()); auto capped_timer_us = std::min( @@ -1192,59 +275,21 @@ kqueue_scheduler::calculate_timeout(long requested_timeout_us) const if (requested_timeout_us < 0) return static_cast(capped_timer_us); - // requested_timeout_us is already long, so min() result fits in long return static_cast(std::min( static_cast(requested_timeout_us), capped_timer_us)); } -inline kqueue_scheduler::work_cleanup::~work_cleanup() -{ - if (ctx) - { - std::int64_t produced = ctx->private_outstanding_work; - if (produced > 1) - scheduler->outstanding_work_.fetch_add( - produced - 1, std::memory_order_relaxed); - else if (produced < 1) - scheduler->work_finished(); - ctx->private_outstanding_work = 0; - - if (!ctx->private_queue.empty()) - { - lock->lock(); - scheduler->completed_ops_.splice(ctx->private_queue); - } - } - else - { - scheduler->work_finished(); - } -} - -inline kqueue_scheduler::task_cleanup::~task_cleanup() -{ - if (ctx && ctx->private_outstanding_work > 0) - { - scheduler->outstanding_work_.fetch_add( - ctx->private_outstanding_work, std::memory_order_relaxed); - ctx->private_outstanding_work = 0; - } -} - inline void kqueue_scheduler::run_task( - std::unique_lock& lock, kqueue::scheduler_context* ctx) + std::unique_lock& lock, context_type* ctx) { long effective_timeout_us = task_interrupted_ ? 
0 : calculate_timeout(-1); if (lock.owns_lock()) lock.unlock(); - // Flush private work count when reactor completes - task_cleanup on_exit{this, ctx}; - (void)on_exit; + task_cleanup on_exit{this, &lock, ctx}; - // Convert timeout to timespec for kevent() struct timespec ts; struct timespec* ts_ptr = nullptr; if (effective_timeout_us >= 0) @@ -1254,7 +299,6 @@ kqueue_scheduler::run_task( ts_ptr = &ts; } - // Event loop runs without mutex held struct kevent events[128]; int nev = ::kevent(kq_fd_, nullptr, 0, events, 128, ts_ptr); int saved_errno = errno; @@ -1263,18 +307,11 @@ kqueue_scheduler::run_task( detail::throw_system_error(make_err(saved_errno), "kevent"); op_queue local_ops; - std::int64_t completions_queued = 0; - // Process events without holding the mutex for (int i = 0; i < nev; ++i) { if (events[i].filter == EVFILT_USER) { - // Interrupt event - clear the armed flag. - // Release pairs with the acquire CAS failure path in - // interrupt_reactor(), ensuring the reactor sees our - // store of false and can re-arm the EVFILT_USER trigger. - // On x86 (TSO) this compiles identically to relaxed. user_event_armed_.store(false, std::memory_order_release); continue; } @@ -1283,7 +320,6 @@ kqueue_scheduler::run_task( if (!desc) continue; - // Map kqueue events to ready-event flags std::uint32_t ready = 0; if (events[i].filter == EVFILT_READ) @@ -1294,155 +330,31 @@ kqueue_scheduler::run_task( if (events[i].flags & EV_ERROR) ready |= kqueue_event_error; - // EV_EOF: peer closed or error condition if (events[i].flags & EV_EOF) { - // EV_EOF on a read filter means the peer closed — deliver as - // a read event so the read returns 0 (EOF) if (events[i].filter == EVFILT_READ) ready |= kqueue_event_read; - // fflags contains the socket error (if any) when EV_EOF is set if (events[i].fflags != 0) ready |= kqueue_event_error; } desc->add_ready_events(ready); - // Only enqueue if not already enqueued. 
- // acq_rel on success: release makes add_ready_events visible - // to the consumer's acquire exchange; acquire pairs with the - // consumer's release store of false so we read the latest - // value. acquire on failure: ensures the CAS load sees the - // consumer's release store on ARM (prevents stale reads from - // the store buffer). On x86 (TSO) these compile identically - // to the weaker orderings. bool expected = false; if (desc->is_enqueued_.compare_exchange_strong( expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) { local_ops.push(desc); - ++completions_queued; } } - // Process timers after kevent returns timer_svc_->process_expired(); - // --- Acquire mutex only for queue operations --- lock.lock(); if (!local_ops.empty()) completed_ops_.splice(local_ops); - - // Drain private queue to global — flush work count BEFORE splicing - // so consumer threads can't decrement outstanding_work_ to zero - // before the count reflects the newly visible operations. - if (ctx && !ctx->private_queue.empty()) - { - if (ctx->private_outstanding_work > 0) - { - outstanding_work_.fetch_add( - ctx->private_outstanding_work, std::memory_order_relaxed); - completions_queued += ctx->private_outstanding_work; - ctx->private_outstanding_work = 0; - } - completed_ops_.splice(ctx->private_queue); - } - - // Signal and wake one waiter if work is queued - if (completions_queued > 0) - { - if (maybe_unlock_and_signal_one(lock)) - lock.lock(); - } -} - -inline std::size_t -kqueue_scheduler::do_one( - std::unique_lock& lock, - long timeout_us, - kqueue::scheduler_context* ctx) -{ - for (;;) - { - if (stopped_.load(std::memory_order_relaxed)) - return 0; - - scheduler_op* op = completed_ops_.pop(); - - // Handle reactor sentinel - time to poll for I/O - if (op == &task_op_) - { - bool more_handlers = - !completed_ops_.empty() || (ctx && !ctx->private_queue.empty()); - - // Nothing to run the reactor for: no pending work to wait on, - // or caller requested a 
non-blocking poll - if (!more_handlers && - (outstanding_work_.load(std::memory_order_acquire) == 0 || - timeout_us == 0)) - { - completed_ops_.push(&task_op_); - return 0; - } - - task_interrupted_ = more_handlers || timeout_us == 0; - task_running_ = true; - - if (more_handlers) - unlock_and_signal_one(lock); - - try - { - run_task(lock, ctx); - } - catch (...) - { - task_running_ = false; - throw; - } - - task_running_ = false; - completed_ops_.push(&task_op_); - continue; - } - - // Handle operation - if (op != nullptr) - { - bool more = !completed_ops_.empty(); - - if (more) - ctx->unassisted = !unlock_and_signal_one(lock); - else - { - ctx->unassisted = false; - lock.unlock(); - } - - work_cleanup on_exit{this, &lock, ctx}; - (void)on_exit; - - (*op)(); - return 1; - } - - // No work from global queue - try private queue before blocking - if (kqueue::drain_private_queue(ctx, outstanding_work_, completed_ops_)) - continue; - - // No pending work to wait on, or caller requested non-blocking poll - if (outstanding_work_.load(std::memory_order_acquire) == 0 || - timeout_us == 0) - return 0; - - clear_signal(); - if (timeout_us < 0) - wait_for_signal(lock); - else - wait_for_signal_for(lock, timeout_us); - } } } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_socket.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_socket.hpp index 02655765..3ec75772 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_socket.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_socket.hpp @@ -15,13 +15,9 @@ #if BOOST_COROSIO_HAS_KQUEUE -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { @@ -29,12 +25,19 @@ class kqueue_socket_service; /// Socket implementation for kqueue backend. 
class kqueue_socket final - : public tcp_socket::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_socket< + kqueue_socket, + kqueue_socket_service, + kqueue_op, + kqueue_connect_op, + kqueue_read_op, + kqueue_write_op, + descriptor_state> { friend class kqueue_socket_service; + bool user_set_linger_ = false; + public: explicit kqueue_socket(kqueue_socket_service& svc) noexcept; ~kqueue_socket(); @@ -62,70 +65,15 @@ class kqueue_socket final std::error_code*, std::size_t*) override; - std::error_code shutdown(tcp_socket::shutdown_type what) noexcept override; - - native_handle_type native_handle() const noexcept override - { - return fd_; - } - - // Socket options + /// Track SO_LINGER for macOS kqueue workaround. std::error_code set_option( int level, int optname, void const* data, std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - endpoint remote_endpoint() const noexcept override - { - return remote_endpoint_; - } - bool is_open() const noexcept - { - return fd_ >= 0; - } + void cancel() noexcept override; - void cancel_single_op(kqueue_op& op) noexcept; void close_socket() noexcept; - void set_socket(int fd) noexcept - { - fd_ = fd; - } - void set_endpoints(endpoint local, endpoint remote) noexcept - { - local_endpoint_ = local; - remote_endpoint_ = remote; - } - - // Public for internal integration with the scheduler and reactor — - // not part of the external API. The descriptor_state is accessed by - // the reactor thread (lock-free atomics) and by op completion under - // desc_state_.mutex; the op slots and initiators are only touched - // by the thread that owns the current I/O call. 
- kqueue_connect_op conn_; - kqueue_read_op rd_; - kqueue_write_op wr_; - descriptor_state desc_state_; - - void register_op( - kqueue_op& op, - kqueue_op*& desc_slot, - bool& ready_flag, - bool& cancel_flag) noexcept; - -private: - kqueue_socket_service& svc_; - int fd_ = -1; - bool user_set_linger_ = false; - endpoint local_endpoint_; - endpoint remote_endpoint_; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/kqueue/kqueue_socket_service.hpp b/include/boost/corosio/native/detail/kqueue/kqueue_socket_service.hpp index 2a74aebf..065c42c2 100644 --- a/include/boost/corosio/native/detail/kqueue/kqueue_socket_service.hpp +++ b/include/boost/corosio/native/detail/kqueue/kqueue_socket_service.hpp @@ -21,17 +21,13 @@ #include #include +#include -#include -#include -#include -#include -#include +#include #include #include #include -#include #include #include @@ -65,7 +61,7 @@ Impl Lifetime with shared_ptr ----------------------------- Socket impls use enable_shared_from_this. The service owns impls via - shared_ptr maps (socket_ptrs_) keyed by raw pointer for O(1) lookup and + shared_ptr maps (impl_ptrs_) keyed by raw pointer for O(1) lookup and removal. When a user calls close(), we call cancel() which posts pending ops to the scheduler. @@ -119,21 +115,9 @@ namespace boost::corosio::detail { -/** State for kqueue socket service. */ -class kqueue_socket_state -{ -public: - explicit kqueue_socket_state(kqueue_scheduler& sched) noexcept - : sched_(sched) - { - } - - kqueue_scheduler& sched_; - std::mutex mutex_; - intrusive_list socket_list_; - std::unordered_map> - socket_ptrs_; -}; +/// State for kqueue socket service. +using kqueue_socket_state = + reactor_service_state; /** kqueue socket service implementation. 
@@ -164,7 +148,7 @@ class BOOST_COROSIO_DECL kqueue_socket_service final : public socket_service { return state_->sched_; } - void post(kqueue_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -174,12 +158,6 @@ class BOOST_COROSIO_DECL kqueue_socket_service final : public socket_service // -- Implementation --------------------------------------------------------- -inline void -kqueue_op::canceller::operator()() const noexcept -{ - op->cancel(); -} - inline void kqueue_connect_op::cancel() noexcept { @@ -210,81 +188,17 @@ kqueue_write_op::cancel() noexcept inline void kqueue_op::operator()() { - stop_cb.reset(); - - socket_impl_->desc_state_.scheduler_->reset_inline_budget(); - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else if (is_read_operation() && bytes_transferred == 0) - *ec_out = capy::error::eof; - else - *ec_out = {}; - } - - if (bytes_out) - *bytes_out = bytes_transferred; - - // Move to stack before resuming coroutine. The coroutine might close - // the socket, releasing the last wrapper ref. If impl_ptr were the - // last ref and we destroyed it while still in operator(), we'd have - // use-after-free. Moving to local ensures destruction happens at - // function exit, after all member accesses are complete. 
- capy::executor_ref saved_ex(std::move(ex)); - std::coroutine_handle<> saved_h(std::move(h)); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_io_op(*this); } inline void kqueue_connect_op::operator()() { - stop_cb.reset(); - - socket_impl_->desc_state_.scheduler_->reset_inline_budget(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - // Cache endpoints on successful connect - if (success && socket_impl_) - { - endpoint local_ep; - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd, reinterpret_cast(&local_storage), &local_len) == - 0) - local_ep = from_sockaddr(local_storage); - static_cast(socket_impl_) - ->set_endpoints(local_ep, target_endpoint); - } - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - } - - if (bytes_out) - *bytes_out = bytes_transferred; - - // Move to stack before resuming. See kqueue_op::operator()() for rationale. 
- capy::executor_ref saved_ex(std::move(ex)); - std::coroutine_handle<> saved_h(std::move(h)); - auto prevent_premature_destruction = std::move(impl_ptr); - dispatch_coro(saved_ex, saved_h).resume(); + complete_connect_op(*this); } inline kqueue_socket::kqueue_socket(kqueue_socket_service& svc) noexcept - : svc_(svc) + : reactor_socket(svc) { } @@ -298,102 +212,7 @@ kqueue_socket::connect( std::stop_token token, std::error_code* ec) { - auto& op = conn_; - - sockaddr_storage storage{}; - socklen_t addrlen = - detail::to_sockaddr(ep, detail::socket_family(fd_), storage); - int result = ::connect(fd_, reinterpret_cast(&storage), addrlen); - - // Cache endpoints on sync success - if (result == 0) - { - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd_, reinterpret_cast(&local_storage), &local_len) == - 0) - local_endpoint_ = detail::from_sockaddr(local_storage); - remote_endpoint_ = ep; - } - - if (result == 0 || errno != EINPROGRESS) - { - int err = (result < 0) ? errno : 0; - - if (svc_.scheduler().try_consume_inline_budget()) - { - *ec = err ? make_err(err) : std::error_code{}; - return dispatch_coro(ex, h); - } - - // Budget exhausted — post through queue - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.fd = fd_; - op.target_endpoint = ep; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EINPROGRESS — async path - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.fd = fd_; - op.target_endpoint = ep; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.connect_op, desc_state_.write_ready, - desc_state_.connect_cancel_pending); - return std::noop_coroutine(); -} - -// Register an op with the reactor, handling cached edge events. -// Called under the EAGAIN path when speculative I/O failed. 
-inline void -kqueue_socket::register_op( - kqueue_op& op, - kqueue_op*& desc_slot, - bool& ready_flag, - bool& cancel_flag) noexcept -{ - svc_.work_started(); - - std::lock_guard lock(desc_state_.mutex); - bool io_done = false; - if (ready_flag) - { - ready_flag = false; - op.perform_io(); - io_done = (op.errn != EAGAIN && op.errn != EWOULDBLOCK); - if (!io_done) - op.errn = 0; - } - - if (cancel_flag) - { - cancel_flag = false; - op.cancelled.store(true, std::memory_order_relaxed); - } - - if (io_done || op.cancelled.load(std::memory_order_acquire)) - { - svc_.post(&op); - svc_.work_finished(); - } - else - { - desc_slot = &op; - } + return do_connect(h, ex, ep, token, ec); } inline std::coroutine_handle<> @@ -405,87 +224,7 @@ kqueue_socket::read_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = rd_; - op.reset(); - - capy::mutable_buffer bufs[kqueue_read_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, kqueue_read_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.empty_buffer_read = true; - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(0, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - // Speculative read: try I/O before suspending. On success, return via - // symmetric transfer without touching the scheduler queue — this creates - // a tight pump loop for back-to-back reads on a hot socket. - // Budget limits consecutive inline completions to prevent starvation - // of other connections competing for scheduler time. - ssize_t n; - do - { - n = ::readv(fd_, op.iovecs, op.iovec_count); - } - while (n < 0 && errno == EINTR); - - if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) - { - int err = (n < 0) ? 
errno : 0; - auto bytes = (n > 0) ? static_cast(n) : std::size_t(0); - - if (svc_.scheduler().try_consume_inline_budget()) - { - if (err) - *ec = make_err(err); - else if (n == 0) - *ec = capy::error::eof; - else - *ec = {}; - *bytes_out = bytes; - return dispatch_coro(ex, h); - } - - // Budget exhausted — fall through to queue - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, bytes); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EAGAIN — register with reactor - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.read_op, desc_state_.read_ready, - desc_state_.read_cancel_pending); - return std::noop_coroutine(); + return do_read_some(h, ex, param, token, ec, bytes_out); } inline std::coroutine_handle<> @@ -497,103 +236,7 @@ kqueue_socket::write_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = wr_; - op.reset(); - - capy::mutable_buffer bufs[kqueue_write_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, kqueue_write_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(0, 0); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - // Speculative write: try I/O before suspending. On success, return via - // symmetric transfer without touching the scheduler queue — this creates - // a tight pump loop for back-to-back writes on a hot socket. - // Budget limits consecutive inline completions to prevent starvation. 
- ssize_t n; - do - { - n = ::writev(fd_, op.iovecs, op.iovec_count); - } - while (n < 0 && errno == EINTR); - - if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) - { - int err = (n < 0) ? errno : 0; - auto bytes = (n > 0) ? static_cast(n) : std::size_t(0); - - if (svc_.scheduler().try_consume_inline_budget()) - { - *ec = err ? make_err(err) : std::error_code{}; - *bytes_out = bytes; - return dispatch_coro(ex, h); - } - - // Budget exhausted — fall through to queue - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.start(token, this); - op.impl_ptr = shared_from_this(); - op.complete(err, bytes); - svc_.post(&op); - return std::noop_coroutine(); - } - - // EAGAIN — register with reactor - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - op.impl_ptr = shared_from_this(); - - register_op( - op, desc_state_.write_op, desc_state_.write_ready, - desc_state_.write_cancel_pending); - return std::noop_coroutine(); -} - -inline std::error_code -kqueue_socket::shutdown(tcp_socket::shutdown_type what) noexcept -{ - int how; - switch (what) - { - case tcp_socket::shutdown_receive: - how = SHUT_RD; - break; - case tcp_socket::shutdown_send: - how = SHUT_WR; - break; - case tcp_socket::shutdown_both: - how = SHUT_RDWR; - break; - default: - return make_err(EINVAL); - } - if (::shutdown(fd_, how) != 0) - return make_err(errno); - return {}; + return do_write_some(h, ex, param, token, ec, bytes_out); } inline std::error_code @@ -610,167 +253,17 @@ kqueue_socket::set_option( return {}; } -inline std::error_code -kqueue_socket::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return make_err(errno); - *size = static_cast(len); - return {}; -} - inline void kqueue_socket::cancel() noexcept { - auto self = weak_from_this().lock(); - if (!self) - return; - - 
conn_.request_cancel(); - rd_.request_cancel(); - wr_.request_cancel(); - - kqueue_op* conn_claimed = nullptr; - kqueue_op* rd_claimed = nullptr; - kqueue_op* wr_claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (desc_state_.connect_op == &conn_) - conn_claimed = std::exchange(desc_state_.connect_op, nullptr); - else - desc_state_.connect_cancel_pending = true; - if (desc_state_.read_op == &rd_) - rd_claimed = std::exchange(desc_state_.read_op, nullptr); - else - desc_state_.read_cancel_pending = true; - if (desc_state_.write_op == &wr_) - wr_claimed = std::exchange(desc_state_.write_op, nullptr); - else - desc_state_.write_cancel_pending = true; - } - - if (conn_claimed) - { - conn_.impl_ptr = self; - svc_.post(&conn_); - svc_.work_finished(); - } - if (rd_claimed) - { - rd_.impl_ptr = self; - svc_.post(&rd_); - svc_.work_finished(); - } - if (wr_claimed) - { - wr_.impl_ptr = self; - svc_.post(&wr_); - svc_.work_finished(); - } -} - -inline void -kqueue_socket::cancel_single_op(kqueue_op& op) noexcept -{ - auto self = weak_from_this().lock(); - if (!self) - return; - - op.request_cancel(); - - kqueue_op** desc_op_ptr = nullptr; - if (&op == &conn_) - desc_op_ptr = &desc_state_.connect_op; - else if (&op == &rd_) - desc_op_ptr = &desc_state_.read_op; - else if (&op == &wr_) - desc_op_ptr = &desc_state_.write_op; - - if (desc_op_ptr) - { - kqueue_op* claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - if (*desc_op_ptr == &op) - claimed = std::exchange(*desc_op_ptr, nullptr); - else if (&op == &conn_) - desc_state_.connect_cancel_pending = true; - else if (&op == &rd_) - desc_state_.read_cancel_pending = true; - else if (&op == &wr_) - desc_state_.write_cancel_pending = true; - } - if (claimed) - { - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } - } + do_cancel(); } inline void kqueue_socket::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - conn_.request_cancel(); - 
rd_.request_cancel(); - wr_.request_cancel(); - - kqueue_op* conn_claimed = nullptr; - kqueue_op* rd_claimed = nullptr; - kqueue_op* wr_claimed = nullptr; - { - std::lock_guard lock(desc_state_.mutex); - conn_claimed = std::exchange(desc_state_.connect_op, nullptr); - rd_claimed = std::exchange(desc_state_.read_op, nullptr); - wr_claimed = std::exchange(desc_state_.write_op, nullptr); - desc_state_.read_ready = false; - desc_state_.write_ready = false; - desc_state_.read_cancel_pending = false; - desc_state_.write_cancel_pending = false; - desc_state_.connect_cancel_pending = false; - } - - if (conn_claimed) - { - conn_.impl_ptr = self; - svc_.post(&conn_); - svc_.work_finished(); - } - if (rd_claimed) - { - rd_.impl_ptr = self; - svc_.post(&rd_); - svc_.work_finished(); - } - if (wr_claimed) - { - wr_.impl_ptr = self; - svc_.post(&wr_); - svc_.work_finished(); - } - - if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) - desc_state_.impl_ref_ = self; - } - - if (fd_ >= 0) - { - ::close(fd_); - fd_ = -1; - } - - desc_state_.fd = -1; - desc_state_.registered_events = 0; - user_set_linger_ = false; - - local_endpoint_ = endpoint{}; - remote_endpoint_ = endpoint{}; + do_close_socket(); + user_set_linger_ = false; } inline kqueue_socket_service::kqueue_socket_service( @@ -788,7 +281,7 @@ kqueue_socket_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->socket_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) { if (impl->user_set_linger_ && impl->fd_ >= 0) { @@ -800,7 +293,7 @@ kqueue_socket_service::shutdown() impl->close_socket(); } - // Don't clear socket_ptrs_ here. The scheduler shuts down after us and + // Don't clear impl_ptrs_ here. The scheduler shuts down after us and // drains completed_ops_, calling destroy() on each queued op. 
If we // released our shared_ptrs now, a kqueue_op::destroy() could free the // last ref to an impl whose embedded descriptor_state is still linked @@ -817,8 +310,8 @@ kqueue_socket_service::construct() { std::lock_guard lock(state_->mutex_); - state_->socket_list_.push_back(raw); - state_->socket_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); } return raw; @@ -842,8 +335,8 @@ kqueue_socket_service::destroy(io_object::implementation* impl) kq_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->socket_list_.remove(kq_impl); - state_->socket_ptrs_.erase(kq_impl); + state_->impl_list_.remove(kq_impl); + state_->impl_ptrs_.erase(kq_impl); } inline std::error_code @@ -918,7 +411,7 @@ kqueue_socket_service::close(io_object::handle& h) } inline void -kqueue_socket_service::post(kqueue_op* op) +kqueue_socket_service::post(scheduler_op* op) { state_->sched_.post(op); } diff --git a/include/boost/corosio/native/detail/reactor/reactor_acceptor.hpp b/include/boost/corosio/native/detail/reactor/reactor_acceptor.hpp new file mode 100644 index 00000000..130921fa --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_acceptor.hpp @@ -0,0 +1,306 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_ACCEPTOR_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_ACCEPTOR_HPP + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace boost::corosio::detail { + +/** CRTP base for reactor-backed acceptor implementations. 
+ + Provides shared data members, trivial virtual overrides, and + non-virtual helper methods for cancellation and close. Concrete + backends inherit and add `cancel()`, `close_socket()`, and + `accept()` overrides that delegate to the `do_*` helpers. + + @tparam Derived The concrete acceptor type (CRTP). + @tparam Service The backend's acceptor service type. + @tparam Op The backend's base op type. + @tparam AcceptOp The backend's accept op type. + @tparam DescState The backend's descriptor_state type. +*/ +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +class reactor_acceptor + : public tcp_acceptor::implementation + , public std::enable_shared_from_this + , public intrusive_list::node +{ + friend Derived; + + explicit reactor_acceptor(Service& svc) noexcept : svc_(svc) {} + +protected: + Service& svc_; + int fd_ = -1; + endpoint local_endpoint_; + +public: + /// Pending accept operation slot. + AcceptOp acc_; + + /// Per-descriptor state for persistent reactor registration. + DescState desc_state_; + + ~reactor_acceptor() override = default; + + /// Return the underlying file descriptor. + int native_handle() const noexcept + { + return fd_; + } + + /// Return the cached local endpoint. + endpoint local_endpoint() const noexcept override + { + return local_endpoint_; + } + + /// Return true if the acceptor has an open file descriptor. + bool is_open() const noexcept override + { + return fd_ >= 0; + } + + /// Set a socket option. + std::error_code set_option( + int level, + int optname, + void const* data, + std::size_t size) noexcept override + { + if (::setsockopt( + fd_, level, optname, data, static_cast(size)) != 0) + return make_err(errno); + return {}; + } + + /// Get a socket option. 
+ std::error_code + get_option(int level, int optname, void* data, std::size_t* size) + const noexcept override + { + socklen_t len = static_cast(*size); + if (::getsockopt(fd_, level, optname, data, &len) != 0) + return make_err(errno); + *size = static_cast(len); + return {}; + } + + /// Cache the local endpoint. + void set_local_endpoint(endpoint ep) noexcept + { + local_endpoint_ = ep; + } + + /// Return a reference to the owning service. + Service& service() noexcept + { + return svc_; + } + + /** Cancel a single pending operation. + + Claims the operation from the read_op descriptor slot + under the mutex and posts it to the scheduler as cancelled. + + @param op The operation to cancel. + */ + void cancel_single_op(Op& op) noexcept; + + /** Cancel the pending accept operation. + + Invoked by the derived class's cancel() override. + */ + void do_cancel() noexcept; + + /** Close the acceptor and cancel pending operations. + + Invoked by the derived class's close_socket(). The + derived class may add backend-specific cleanup after + calling this method. + */ + void do_close_socket() noexcept; + + /** Bind the acceptor socket to an endpoint. + + Caches the resolved local endpoint (including ephemeral + port) after a successful bind. + + @param ep The endpoint to bind to. + @return The error code from bind(), or success. + */ + std::error_code do_bind(endpoint ep); + + /** Start listening on the acceptor socket. + + Registers the file descriptor with the reactor after + a successful listen() call. + + @param backlog The listen backlog. + @return The error code from listen(), or success. 
+ */ + std::error_code do_listen(int backlog); +}; + +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +void +reactor_acceptor::cancel_single_op( + Op& op) noexcept +{ + auto self = this->weak_from_this().lock(); + if (!self) + return; + + op.request_cancel(); + + reactor_op_base* claimed = nullptr; + { + std::lock_guard lock(desc_state_.mutex); + if (desc_state_.read_op == &op) + claimed = std::exchange(desc_state_.read_op, nullptr); + } + if (claimed) + { + op.impl_ptr = self; + svc_.post(&op); + svc_.work_finished(); + } +} + +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +void +reactor_acceptor:: + do_cancel() noexcept +{ + cancel_single_op(acc_); +} + +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +void +reactor_acceptor:: + do_close_socket() noexcept +{ + auto self = this->weak_from_this().lock(); + if (self) + { + acc_.request_cancel(); + + reactor_op_base* claimed = nullptr; + { + std::lock_guard lock(desc_state_.mutex); + claimed = std::exchange(desc_state_.read_op, nullptr); + desc_state_.read_ready = false; + desc_state_.write_ready = false; + + if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) + desc_state_.impl_ref_ = self; + } + + if (claimed) + { + acc_.impl_ptr = self; + svc_.post(&acc_); + svc_.work_finished(); + } + } + + if (fd_ >= 0) + { + if (desc_state_.registered_events != 0) + svc_.scheduler().deregister_descriptor(fd_); + ::close(fd_); + fd_ = -1; + } + + desc_state_.fd = -1; + desc_state_.registered_events = 0; + + local_endpoint_ = endpoint{}; +} + +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +std::error_code +reactor_acceptor::do_bind( + endpoint ep) +{ + sockaddr_storage storage{}; + socklen_t addrlen = to_sockaddr(ep, storage); + if (::bind(fd_, reinterpret_cast(&storage), addrlen) < 0) + return make_err(errno); + + // Cache local endpoint 
(resolves ephemeral port) + sockaddr_storage local{}; + socklen_t local_len = sizeof(local); + if (::getsockname(fd_, reinterpret_cast(&local), &local_len) == + 0) + set_local_endpoint(from_sockaddr(local)); + + return {}; +} + +template< + class Derived, + class Service, + class Op, + class AcceptOp, + class DescState> +std::error_code +reactor_acceptor::do_listen( + int backlog) +{ + if (::listen(fd_, backlog) < 0) + return make_err(errno); + + svc_.scheduler().register_descriptor(fd_, &desc_state_); + return {}; +} + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_ACCEPTOR_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_descriptor_state.hpp b/include/boost/corosio/native/detail/reactor/reactor_descriptor_state.hpp new file mode 100644 index 00000000..d434cd7b --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_descriptor_state.hpp @@ -0,0 +1,258 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_DESCRIPTOR_STATE_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_DESCRIPTOR_STATE_HPP + +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace boost::corosio::detail { + +/// Shared reactor event constants. +/// These match epoll numeric values; kqueue maps its events to the same. +static constexpr std::uint32_t reactor_event_read = 0x001; +static constexpr std::uint32_t reactor_event_write = 0x004; +static constexpr std::uint32_t reactor_event_error = 0x008; + +/** Per-descriptor state shared across reactor backends. + + Tracks pending operations for a file descriptor. The fd is registered + once with the reactor and stays registered until closed. 
Uses deferred + I/O: the reactor sets ready_events atomically, then enqueues this state. + When popped by the scheduler, invoke_deferred_io() performs I/O under + the mutex and queues completed ops. + + Non-template: uses reactor_op_base pointers so the scheduler and + descriptor_state code exist as a single copy in the binary regardless + of how many backends are compiled in. + + @par Thread Safety + The mutex protects operation pointers and ready flags. ready_events_ + and is_enqueued_ are atomic for lock-free reactor access. +*/ +struct reactor_descriptor_state : scheduler_op +{ + /// Protects operation pointers and ready/cancel flags. + std::mutex mutex; + + /// Pending read operation (guarded by `mutex`). + reactor_op_base* read_op = nullptr; + + /// Pending write operation (guarded by `mutex`). + reactor_op_base* write_op = nullptr; + + /// Pending connect operation (guarded by `mutex`). + reactor_op_base* connect_op = nullptr; + + /// True if a read edge event arrived before an op was registered. + bool read_ready = false; + + /// True if a write edge event arrived before an op was registered. + bool write_ready = false; + + /// Deferred read cancellation (IOCP-style cancel semantics). + bool read_cancel_pending = false; + + /// Deferred write cancellation (IOCP-style cancel semantics). + bool write_cancel_pending = false; + + /// Deferred connect cancellation (IOCP-style cancel semantics). + bool connect_cancel_pending = false; + + /// Event mask set during registration (no mutex needed). + std::uint32_t registered_events = 0; + + /// File descriptor this state tracks. + int fd = -1; + + /// Accumulated ready events (set by reactor, read by scheduler). + std::atomic ready_events_{0}; + + /// True while this state is queued in the scheduler's completed_ops. + std::atomic is_enqueued_{false}; + + /// Owning scheduler for posting completions. + reactor_scheduler_base const* scheduler_ = nullptr; + + /// Prevents impl destruction while queued in the scheduler. 
+ std::shared_ptr impl_ref_; + + /// Add ready events atomically. + /// Release pairs with the consumer's acquire exchange on + /// ready_events_ so the consumer sees all flags. On x86 (TSO) + /// this compiles to the same LOCK OR as relaxed. + void add_ready_events(std::uint32_t ev) noexcept + { + ready_events_.fetch_or(ev, std::memory_order_release); + } + + /// Invoke deferred I/O and dispatch completions. + void operator()() override + { + invoke_deferred_io(); + } + + /// Destroy without invoking. + /// Called during scheduler::shutdown() drain. Clear impl_ref_ to break + /// the self-referential cycle set by close_socket(). + void destroy() override + { + impl_ref_.reset(); + } + + /** Perform deferred I/O and queue completions. + + Performs I/O under the mutex and queues completed ops. EAGAIN + ops stay parked in their slot for re-delivery on the next + edge event. + */ + void invoke_deferred_io(); +}; + +inline void +reactor_descriptor_state::invoke_deferred_io() +{ + std::shared_ptr prevent_impl_destruction; + op_queue local_ops; + + { + std::lock_guard lock(mutex); + + // Must clear is_enqueued_ and move impl_ref_ under the same + // lock that processes I/O. close_socket() checks is_enqueued_ + // under this mutex — without atomicity between the flag store + // and the ref move, close_socket() could see is_enqueued_==false, + // skip setting impl_ref_, and destroy the impl under us. 
+ prevent_impl_destruction = std::move(impl_ref_); + is_enqueued_.store(false, std::memory_order_release); + + std::uint32_t ev = + ready_events_.exchange(0, std::memory_order_acquire); + if (ev == 0) + { + // Mutex unlocks here; compensate for work_cleanup's decrement + scheduler_->compensating_work_started(); + return; + } + + int err = 0; + if (ev & reactor_event_error) + { + socklen_t len = sizeof(err); + if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) + err = errno; + if (err == 0) + err = EIO; + } + + if (ev & reactor_event_read) + { + if (read_op) + { + auto* rd = read_op; + if (err) + rd->complete(err, 0); + else + rd->perform_io(); + + if (rd->errn == EAGAIN || rd->errn == EWOULDBLOCK) + { + rd->errn = 0; + } + else + { + read_op = nullptr; + local_ops.push(rd); + } + } + else + { + read_ready = true; + } + } + if (ev & reactor_event_write) + { + bool had_write_op = (connect_op || write_op); + if (connect_op) + { + auto* cn = connect_op; + if (err) + cn->complete(err, 0); + else + cn->perform_io(); + connect_op = nullptr; + local_ops.push(cn); + } + if (write_op) + { + auto* wr = write_op; + if (err) + wr->complete(err, 0); + else + wr->perform_io(); + + if (wr->errn == EAGAIN || wr->errn == EWOULDBLOCK) + { + wr->errn = 0; + } + else + { + write_op = nullptr; + local_ops.push(wr); + } + } + if (!had_write_op) + write_ready = true; + } + if (err) + { + if (read_op) + { + read_op->complete(err, 0); + local_ops.push(std::exchange(read_op, nullptr)); + } + if (write_op) + { + write_op->complete(err, 0); + local_ops.push(std::exchange(write_op, nullptr)); + } + if (connect_op) + { + connect_op->complete(err, 0); + local_ops.push(std::exchange(connect_op, nullptr)); + } + } + } + + // Execute first handler inline — the scheduler's work_cleanup + // accounts for this as the "consumed" work item + scheduler_op* first = local_ops.pop(); + if (first) + { + scheduler_->post_deferred_completions(local_ops); + (*first)(); + } + else + { + 
scheduler_->compensating_work_started(); + } +} + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_DESCRIPTOR_STATE_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_op.hpp b/include/boost/corosio/native/detail/reactor/reactor_op.hpp new file mode 100644 index 00000000..a74412d3 --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_op.hpp @@ -0,0 +1,309 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace boost::corosio::detail { + +/** Base operation for reactor-based backends. + + Holds per-operation state that depends on the concrete backend + socket/acceptor types: coroutine handle, executor, output + pointers, file descriptor, stop_callback, and type-specific + impl pointers. + + Fields shared across all backends (errn, bytes_transferred, + cancelled, impl_ptr, perform_io, complete) live in + reactor_op_base so the scheduler and descriptor_state can + access them without template instantiation. + + @tparam Socket The backend socket impl type (forward-declared). + @tparam Acceptor The backend acceptor impl type (forward-declared). +*/ +template +struct reactor_op : reactor_op_base +{ + /// Stop-token callback that invokes cancel() on the target op. + struct canceller + { + reactor_op* op; + void operator()() const noexcept + { + op->cancel(); + } + }; + + /// Caller's coroutine handle to resume on completion. 
+ std::coroutine_handle<> h; + + /// Executor for dispatching the completion. + capy::executor_ref ex; + + /// Output pointer for the error code. + std::error_code* ec_out = nullptr; + + /// Output pointer for bytes transferred. + std::size_t* bytes_out = nullptr; + + /// File descriptor this operation targets. + int fd = -1; + + /// Stop-token callback registration. + std::optional> stop_cb; + + /// Owning socket impl (for stop_token cancellation). + Socket* socket_impl_ = nullptr; + + /// Owning acceptor impl (for stop_token cancellation). + Acceptor* acceptor_impl_ = nullptr; + + reactor_op() = default; + + /// Reset operation state for reuse. + void reset() noexcept + { + fd = -1; + errn = 0; + bytes_transferred = 0; + cancelled.store(false, std::memory_order_relaxed); + impl_ptr.reset(); + socket_impl_ = nullptr; + acceptor_impl_ = nullptr; + } + + /// Return true if this is a read-direction operation. + virtual bool is_read_operation() const noexcept + { + return false; + } + + /// Cancel this operation via the owning impl. + virtual void cancel() noexcept = 0; + + /// Destroy without invoking. + void destroy() override + { + stop_cb.reset(); + reactor_op_base::destroy(); + } + + /// Arm the stop-token callback for a socket operation. + void start(std::stop_token const& token, Socket* impl) + { + cancelled.store(false, std::memory_order_release); + stop_cb.reset(); + socket_impl_ = impl; + acceptor_impl_ = nullptr; + + if (token.stop_possible()) + stop_cb.emplace(token, canceller{this}); + } + + /// Arm the stop-token callback for an acceptor operation. + void start(std::stop_token const& token, Acceptor* impl) + { + cancelled.store(false, std::memory_order_release); + stop_cb.reset(); + socket_impl_ = nullptr; + acceptor_impl_ = impl; + + if (token.stop_possible()) + stop_cb.emplace(token, canceller{this}); + } +}; + +/** Shared connect operation. + + Checks SO_ERROR for connect completion status. 
The operator()() + and cancel() are provided by the concrete backend type. + + @tparam Base The backend's base op type. +*/ +template +struct reactor_connect_op : Base +{ + /// Endpoint to connect to. + endpoint target_endpoint; + + /// Reset operation state for reuse. + void reset() noexcept + { + Base::reset(); + target_endpoint = endpoint{}; + } + + void perform_io() noexcept override + { + int err = 0; + socklen_t len = sizeof(err); + if (::getsockopt(this->fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) + err = errno; + this->complete(err, 0); + } +}; + +/** Shared scatter-read operation. + + Uses readv() with an EINTR retry loop. + + @tparam Base The backend's base op type. +*/ +template +struct reactor_read_op : Base +{ + /// Maximum scatter-gather buffer count. + static constexpr std::size_t max_buffers = 16; + + /// Scatter-gather I/O vectors. + iovec iovecs[max_buffers]; + + /// Number of active I/O vectors. + int iovec_count = 0; + + /// True for zero-length reads (completed immediately). + bool empty_buffer_read = false; + + /// Return true (this is a read-direction operation). + bool is_read_operation() const noexcept override + { + return !empty_buffer_read; + } + + void reset() noexcept + { + Base::reset(); + iovec_count = 0; + empty_buffer_read = false; + } + + void perform_io() noexcept override + { + ssize_t n; + do + { + n = ::readv(this->fd, iovecs, iovec_count); + } + while (n < 0 && errno == EINTR); + + if (n >= 0) + this->complete(0, static_cast(n)); + else + this->complete(errno, 0); + } +}; + +/** Shared gather-write operation. + + Delegates the actual syscall to WritePolicy::write(fd, iovecs, count), + which returns ssize_t (bytes written or -1 with errno set). + + @tparam Base The backend's base op type. + @tparam WritePolicy Provides `static ssize_t write(int, iovec*, int)`. +*/ +template +struct reactor_write_op : Base +{ + /// The write syscall policy type. + using write_policy = WritePolicy; + + /// Maximum scatter-gather buffer count. 
+ static constexpr std::size_t max_buffers = 16; + + /// Scatter-gather I/O vectors. + iovec iovecs[max_buffers]; + + /// Number of active I/O vectors. + int iovec_count = 0; + + void reset() noexcept + { + Base::reset(); + iovec_count = 0; + } + + void perform_io() noexcept override + { + ssize_t n = WritePolicy::write(this->fd, iovecs, iovec_count); + if (n >= 0) + this->complete(0, static_cast(n)); + else + this->complete(errno, 0); + } +}; + +/** Shared accept operation. + + Delegates the actual syscall to AcceptPolicy::do_accept(fd, peer_storage), + which returns the accepted fd or -1 with errno set. + + @tparam Base The backend's base op type. + @tparam AcceptPolicy Provides `static int do_accept(int, sockaddr_storage&)`. +*/ +template +struct reactor_accept_op : Base +{ + /// File descriptor of the accepted connection. + int accepted_fd = -1; + + /// Pointer to the peer socket implementation. + io_object::implementation* peer_impl = nullptr; + + /// Output pointer for the accepted implementation. + io_object::implementation** impl_out = nullptr; + + /// Peer address storage filled by accept. 
+ sockaddr_storage peer_storage{}; + + void reset() noexcept + { + Base::reset(); + accepted_fd = -1; + peer_impl = nullptr; + impl_out = nullptr; + peer_storage = {}; + } + + void perform_io() noexcept override + { + int new_fd = AcceptPolicy::do_accept(this->fd, peer_storage); + if (new_fd >= 0) + { + accepted_fd = new_fd; + this->complete(0, 0); + } + else + { + this->complete(errno, 0); + } + } +}; + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_op_base.hpp b/include/boost/corosio/native/detail/reactor/reactor_op_base.hpp new file mode 100644 index 00000000..5690ecc2 --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_op_base.hpp @@ -0,0 +1,69 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_BASE_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_BASE_HPP + +#include + +#include +#include +#include + +namespace boost::corosio::detail { + +/** Non-template base for reactor operations. + + Holds per-operation state accessed by reactor_descriptor_state + and reactor_socket without requiring knowledge of the concrete + backend socket/acceptor types. This avoids duplicate template + instantiations for the descriptor_state and scheduler hot paths. + + @see reactor_op +*/ +struct reactor_op_base : scheduler_op +{ + /// Errno from the last I/O attempt. + int errn = 0; + + /// Bytes transferred on success. + std::size_t bytes_transferred = 0; + + /// True when cancellation has been requested. + std::atomic cancelled{false}; + + /// Prevents use-after-free when socket is closed with pending ops. 
+ std::shared_ptr impl_ptr; + + /// Record the result of an I/O attempt. + void complete(int err, std::size_t bytes) noexcept + { + errn = err; + bytes_transferred = bytes; + } + + /// Perform the I/O syscall (overridden by concrete op types). + virtual void perform_io() noexcept {} + + /// Mark as cancelled (visible to the I/O completion path). + void request_cancel() noexcept + { + cancelled.store(true, std::memory_order_release); + } + + /// Destroy without invoking. + void destroy() override + { + impl_ptr.reset(); + } +}; + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_BASE_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_op_complete.hpp b/include/boost/corosio/native/detail/reactor/reactor_op_complete.hpp new file mode 100644 index 00000000..bc0d35ac --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_op_complete.hpp @@ -0,0 +1,216 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_COMPLETE_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_COMPLETE_HPP + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace boost::corosio::detail { + +/** Complete a base read/write operation. + + Translates the recorded errno and cancellation state into + an error_code, stores the byte count, then resumes the + caller via symmetric transfer. + + @tparam Op The concrete operation type. + @param op The operation to complete. 
+*/
+template<class Op>
+void
+complete_io_op(Op& op)
+{
+    // Disarm the stop-token callback before publishing the result so a
+    // late stop request can no longer invoke cancel() on this op.
+    op.stop_cb.reset();
+    op.socket_impl_->desc_state_.scheduler_->reset_inline_budget();
+
+    // Cancellation wins over a recorded errno. A read-direction op that
+    // succeeded with zero bytes is reported as end-of-file.
+    if (op.cancelled.load(std::memory_order_acquire))
+        *op.ec_out = capy::error::canceled;
+    else if (op.errn != 0)
+        *op.ec_out = make_err(op.errn);
+    else if (op.is_read_operation() && op.bytes_transferred == 0)
+        *op.ec_out = capy::error::eof;
+    else
+        *op.ec_out = {};
+
+    *op.bytes_out = op.bytes_transferred;
+
+    // Snapshot the executor and handle and pin the impl in a local
+    // before resuming; `op` is not touched after resume() (presumably
+    // the resumed coroutine may reuse or release it).
+    capy::executor_ref saved_ex(op.ex);
+    std::coroutine_handle<> saved_h(op.h);
+    auto prevent = std::move(op.impl_ptr);
+    dispatch_coro(saved_ex, saved_h).resume();
+}
+
+/** Complete a connect operation with endpoint caching.
+
+    On success, queries the local endpoint via getsockname and
+    caches both endpoints in the socket impl. Then resumes the
+    caller via symmetric transfer.
+
+    The cancellation flag is sampled exactly once, so the decision
+    to cache endpoints and the error code written to `ec_out`
+    always agree even if cancel() runs concurrently.
+
+    @tparam Op The concrete connect operation type.
+    @param op The operation to complete.
+*/
+template<class Op>
+void
+complete_connect_op(Op& op)
+{
+    op.stop_cb.reset();
+
+    // socket_impl_ is null-checked below before caching endpoints;
+    // apply the same guard here instead of dereferencing it blindly.
+    if (op.socket_impl_)
+        op.socket_impl_->desc_state_.scheduler_->reset_inline_budget();
+
+    // Single acquire load: a second load could observe a cancel that
+    // arrived in between and report `canceled` after the endpoints
+    // were already cached as if the connect had succeeded.
+    bool const was_cancelled = op.cancelled.load(std::memory_order_acquire);
+    bool const success = (op.errn == 0 && !was_cancelled);
+
+    if (success && op.socket_impl_)
+    {
+        // A failed getsockname leaves local_ep default-constructed;
+        // the connect itself is still reported as successful.
+        endpoint local_ep;
+        sockaddr_storage local_storage{};
+        socklen_t local_len = sizeof(local_storage);
+        if (::getsockname(
+                op.fd,
+                reinterpret_cast<sockaddr*>(&local_storage),
+                &local_len) == 0)
+            local_ep = from_sockaddr(local_storage);
+        op.socket_impl_->set_endpoints(local_ep, op.target_endpoint);
+    }
+
+    if (was_cancelled)
+        *op.ec_out = capy::error::canceled;
+    else if (op.errn != 0)
+        *op.ec_out = make_err(op.errn);
+    else
+        *op.ec_out = {};
+
+    // Snapshot the resume state and pin the impl before resuming;
+    // `op` is not touched after resume().
+    capy::executor_ref saved_ex(op.ex);
+    std::coroutine_handle<> saved_h(op.h);
+    auto prevent = std::move(op.impl_ptr);
+    dispatch_coro(saved_ex, saved_h).resume();
+}
+
+/** Construct and register a peer socket from an accepted fd.
+ + Creates a new socket impl via the acceptor's associated + socket service, registers it with the scheduler, and caches + the local and remote endpoints. + + @tparam SocketImpl The concrete socket implementation type. + @tparam AcceptorImpl The concrete acceptor implementation type. + @param acceptor_impl The acceptor that accepted the connection. + @param accepted_fd The accepted file descriptor (set to -1 on success). + @param peer_storage The peer address from accept(). + @param impl_out Output pointer for the new socket impl. + @param ec_out Output pointer for any error. + @return True on success, false on failure. +*/ +template +bool +setup_accepted_socket( + AcceptorImpl* acceptor_impl, + int& accepted_fd, + sockaddr_storage const& peer_storage, + io_object::implementation** impl_out, + std::error_code* ec_out) +{ + auto* socket_svc = acceptor_impl->service().socket_service(); + if (!socket_svc) + { + *ec_out = make_err(ENOENT); + return false; + } + + auto& impl = static_cast(*socket_svc->construct()); + impl.set_socket(accepted_fd); + + impl.desc_state_.fd = accepted_fd; + { + std::lock_guard lock(impl.desc_state_.mutex); + impl.desc_state_.read_op = nullptr; + impl.desc_state_.write_op = nullptr; + impl.desc_state_.connect_op = nullptr; + } + socket_svc->scheduler().register_descriptor( + accepted_fd, &impl.desc_state_); + + impl.set_endpoints( + acceptor_impl->local_endpoint(), + from_sockaddr(peer_storage)); + + if (impl_out) + *impl_out = &impl; + accepted_fd = -1; + return true; +} + +/** Complete an accept operation. + + Sets up the peer socket on success, or closes the accepted + fd on failure. Then resumes the caller via symmetric transfer. + + @tparam SocketImpl The concrete socket implementation type. + @tparam Op The concrete accept operation type. + @param op The operation to complete. 
+*/ +template +void +complete_accept_op(Op& op) +{ + op.stop_cb.reset(); + op.acceptor_impl_->desc_state_.scheduler_->reset_inline_budget(); + + bool success = + (op.errn == 0 && !op.cancelled.load(std::memory_order_acquire)); + + if (op.cancelled.load(std::memory_order_acquire)) + *op.ec_out = capy::error::canceled; + else if (op.errn != 0) + *op.ec_out = make_err(op.errn); + else + *op.ec_out = {}; + + if (success && op.accepted_fd >= 0 && op.acceptor_impl_) + { + if (!setup_accepted_socket( + op.acceptor_impl_, + op.accepted_fd, + op.peer_storage, + op.impl_out, + op.ec_out)) + success = false; + } + + if (!success || !op.acceptor_impl_) + { + if (op.accepted_fd >= 0) + { + ::close(op.accepted_fd); + op.accepted_fd = -1; + } + if (op.impl_out) + *op.impl_out = nullptr; + } + + capy::executor_ref saved_ex(op.ex); + std::coroutine_handle<> saved_h(op.h); + auto prevent = std::move(op.impl_ptr); + dispatch_coro(saved_ex, saved_h).resume(); +} + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_OP_COMPLETE_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_scheduler.hpp b/include/boost/corosio/native/detail/reactor/reactor_scheduler.hpp new file mode 100644 index 00000000..0e6c50f0 --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_scheduler.hpp @@ -0,0 +1,837 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SCHEDULER_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SCHEDULER_HPP + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace boost::corosio::detail { + +// Forward declaration +class reactor_scheduler_base; + +/** Per-thread state for a reactor scheduler. + + Each thread running a scheduler's event loop has one of these + on a thread-local stack. It holds a private work queue and + inline completion budget for speculative I/O fast paths. +*/ +struct BOOST_COROSIO_SYMBOL_VISIBLE reactor_scheduler_context +{ + /// Scheduler this context belongs to. + reactor_scheduler_base const* key; + + /// Next context frame on this thread's stack. + reactor_scheduler_context* next; + + /// Private work queue for reduced contention. + op_queue private_queue; + + /// Unflushed work count for the private queue. + std::int64_t private_outstanding_work; + + /// Remaining inline completions allowed this cycle. + int inline_budget; + + /// Maximum inline budget (adaptive, 2-16). + int inline_budget_max; + + /// True if no other thread absorbed queued work last cycle. + bool unassisted; + + /// Construct a context frame linked to @a n. + reactor_scheduler_context( + reactor_scheduler_base const* k, reactor_scheduler_context* n) + : key(k) + , next(n) + , private_outstanding_work(0) + , inline_budget(0) + , inline_budget_max(2) + , unassisted(false) + { + } +}; + +/// Thread-local context stack for reactor schedulers. +inline thread_local_ptr reactor_context_stack; + +/// Find the context frame for a scheduler on this thread. 
+inline reactor_scheduler_context* +reactor_find_context(reactor_scheduler_base const* self) noexcept +{ + for (auto* c = reactor_context_stack.get(); c != nullptr; c = c->next) + { + if (c->key == self) + return c; + } + return nullptr; +} + +/// Flush private work count to global counter. +inline void +reactor_flush_private_work( + reactor_scheduler_context* ctx, + std::atomic& outstanding_work) noexcept +{ + if (ctx && ctx->private_outstanding_work > 0) + { + outstanding_work.fetch_add( + ctx->private_outstanding_work, std::memory_order_relaxed); + ctx->private_outstanding_work = 0; + } +} + +/** Drain private queue to global queue, flushing work count first. + + @return True if any ops were drained. +*/ +inline bool +reactor_drain_private_queue( + reactor_scheduler_context* ctx, + std::atomic& outstanding_work, + op_queue& completed_ops) noexcept +{ + if (!ctx || ctx->private_queue.empty()) + return false; + + reactor_flush_private_work(ctx, outstanding_work); + completed_ops.splice(ctx->private_queue); + return true; +} + +/** Non-template base for reactor-backed scheduler implementations. + + Provides the complete threading model shared by epoll, kqueue, + and select schedulers: signal state machine, inline completion + budget, work counting, run/poll methods, and the do_one event + loop. + + Derived classes provide platform-specific hooks by overriding: + - `run_task(lock, ctx)` to run the reactor poll + - `interrupt_reactor()` to wake a blocked reactor + + De-templated from the original CRTP design to eliminate + duplicate instantiations when multiple backends are compiled + into the same binary. Virtual dispatch for run_task (called + once per reactor cycle, before a blocking syscall) has + negligible overhead. + + @par Thread Safety + All public member functions are thread-safe. 
+*/ +class reactor_scheduler_base + : public native_scheduler + , public capy::execution_context::service +{ +public: + using key_type = scheduler; + using context_type = reactor_scheduler_context; + + /// Post a coroutine for deferred execution. + void post(std::coroutine_handle<> h) const override; + + /// Post a scheduler operation for deferred execution. + void post(scheduler_op* h) const override; + + /// Return true if called from a thread running this scheduler. + bool running_in_this_thread() const noexcept override; + + /// Request the scheduler to stop dispatching handlers. + void stop() override; + + /// Return true if the scheduler has been stopped. + bool stopped() const noexcept override; + + /// Reset the stopped state so `run()` can resume. + void restart() override; + + /// Run the event loop until no work remains. + std::size_t run() override; + + /// Run until one handler completes or no work remains. + std::size_t run_one() override; + + /// Run until one handler completes or @a usec elapses. + std::size_t wait_one(long usec) override; + + /// Run ready handlers without blocking. + std::size_t poll() override; + + /// Run at most one ready handler without blocking. + std::size_t poll_one() override; + + /// Increment the outstanding work count. + void work_started() noexcept override; + + /// Decrement the outstanding work count, stopping on zero. + void work_finished() noexcept override; + + /** Reset the thread's inline completion budget. + + Called at the start of each posted completion handler to + grant a fresh budget for speculative inline completions. + */ + void reset_inline_budget() const noexcept; + + /** Consume one unit of inline budget if available. + + @return True if budget was available and consumed. + */ + bool try_consume_inline_budget() const noexcept; + + /** Offset a forthcoming work_finished from work_cleanup. + + Called by descriptor_state when all I/O returned EAGAIN and + no handler will be executed. 
Must be called from a scheduler + thread. + */ + void compensating_work_started() const noexcept; + + /** Drain work from thread context's private queue to global queue. + + Flushes private work count to the global counter, then + transfers the queue under mutex protection. + + @param queue The private queue to drain. + @param count Private work count to flush before draining. + */ + void drain_thread_queue(op_queue& queue, std::int64_t count) const; + + /** Post completed operations for deferred invocation. + + If called from a thread running this scheduler, operations + go to the thread's private queue (fast path). Otherwise, + operations are added to the global queue under mutex and a + waiter is signaled. + + @par Preconditions + work_started() must have been called for each operation. + + @param ops Queue of operations to post. + */ + void post_deferred_completions(op_queue& ops) const; + +protected: + reactor_scheduler_base() = default; + + /** Drain completed_ops during shutdown. + + Pops all operations from the global queue and destroys them, + skipping the task sentinel. Signals all waiting threads. + Derived classes call this from their shutdown() override + before performing platform-specific cleanup. + */ + void shutdown_drain(); + + /// RAII guard that re-inserts the task sentinel after `run_task`. + struct task_cleanup + { + reactor_scheduler_base const* sched; + std::unique_lock* lock; + context_type* ctx; + ~task_cleanup(); + }; + + mutable std::mutex mutex_; + mutable std::condition_variable cond_; + mutable op_queue completed_ops_; + mutable std::atomic outstanding_work_{0}; + bool stopped_ = false; + mutable std::atomic task_running_{false}; + mutable bool task_interrupted_ = false; + + /// Bit 0 of `state_`: set when the condvar should be signaled. + static constexpr std::size_t signaled_bit = 1; + + /// Increment per waiting thread in `state_`. 
+ static constexpr std::size_t waiter_increment = 2; + mutable std::size_t state_ = 0; + + /// Sentinel op that triggers a reactor poll when dequeued. + struct task_op final : scheduler_op + { + void operator()() override {} + void destroy() override {} + }; + task_op task_op_; + + /// Run the platform-specific reactor poll. + virtual void + run_task(std::unique_lock& lock, context_type* ctx) = 0; + + /// Wake a blocked reactor (e.g. write to eventfd or pipe). + virtual void interrupt_reactor() const = 0; + +private: + struct work_cleanup + { + reactor_scheduler_base* sched; + std::unique_lock* lock; + context_type* ctx; + ~work_cleanup(); + }; + + std::size_t do_one( + std::unique_lock& lock, long timeout_us, context_type* ctx); + + void signal_all(std::unique_lock& lock) const; + bool maybe_unlock_and_signal_one(std::unique_lock& lock) const; + bool unlock_and_signal_one(std::unique_lock& lock) const; + void clear_signal() const; + void wait_for_signal(std::unique_lock& lock) const; + void wait_for_signal_for( + std::unique_lock& lock, long timeout_us) const; + void wake_one_thread_and_unlock(std::unique_lock& lock) const; +}; + +/** RAII guard that pushes/pops a scheduler context frame. + + On construction, pushes a new context frame onto the + thread-local stack. On destruction, drains any remaining + private queue items to the global queue and pops the frame. +*/ +struct reactor_thread_context_guard +{ + /// The context frame managed by this guard. + reactor_scheduler_context frame_; + + /// Construct the guard, pushing a frame for @a sched. + explicit reactor_thread_context_guard( + reactor_scheduler_base const* sched) noexcept + : frame_(sched, reactor_context_stack.get()) + { + reactor_context_stack.set(&frame_); + } + + /// Destroy the guard, draining private work and popping the frame. 
+ ~reactor_thread_context_guard() noexcept + { + if (!frame_.private_queue.empty()) + frame_.key->drain_thread_queue( + frame_.private_queue, frame_.private_outstanding_work); + reactor_context_stack.set(frame_.next); + } +}; + +// ---- Inline implementations ------------------------------------------------ + +inline void +reactor_scheduler_base::reset_inline_budget() const noexcept +{ + if (auto* ctx = reactor_find_context(this)) + { + // Cap when no other thread absorbed queued work + if (ctx->unassisted) + { + ctx->inline_budget_max = 4; + ctx->inline_budget = 4; + return; + } + // Ramp up when previous cycle fully consumed budget + if (ctx->inline_budget == 0) + ctx->inline_budget_max = (std::min)(ctx->inline_budget_max * 2, 16); + else if (ctx->inline_budget < ctx->inline_budget_max) + ctx->inline_budget_max = 2; + ctx->inline_budget = ctx->inline_budget_max; + } +} + +inline bool +reactor_scheduler_base::try_consume_inline_budget() const noexcept +{ + if (auto* ctx = reactor_find_context(this)) + { + if (ctx->inline_budget > 0) + { + --ctx->inline_budget; + return true; + } + } + return false; +} + +inline void +reactor_scheduler_base::post(std::coroutine_handle<> h) const +{ + struct post_handler final : scheduler_op + { + std::coroutine_handle<> h_; + + explicit post_handler(std::coroutine_handle<> h) : h_(h) {} + ~post_handler() override = default; + + void operator()() override + { + auto saved = h_; + delete this; + // Ensure stores from the posting thread are visible + std::atomic_thread_fence(std::memory_order_acquire); + saved.resume(); + } + + void destroy() override + { + auto saved = h_; + delete this; + saved.destroy(); + } + }; + + auto ph = std::make_unique(h); + + if (auto* ctx = reactor_find_context(this)) + { + ++ctx->private_outstanding_work; + ctx->private_queue.push(ph.release()); + return; + } + + outstanding_work_.fetch_add(1, std::memory_order_relaxed); + + std::unique_lock lock(mutex_); + completed_ops_.push(ph.release()); + 
wake_one_thread_and_unlock(lock); +} + +inline void +reactor_scheduler_base::post(scheduler_op* h) const +{ + if (auto* ctx = reactor_find_context(this)) + { + ++ctx->private_outstanding_work; + ctx->private_queue.push(h); + return; + } + + outstanding_work_.fetch_add(1, std::memory_order_relaxed); + + std::unique_lock lock(mutex_); + completed_ops_.push(h); + wake_one_thread_and_unlock(lock); +} + +inline bool +reactor_scheduler_base::running_in_this_thread() const noexcept +{ + return reactor_find_context(this) != nullptr; +} + +inline void +reactor_scheduler_base::stop() +{ + std::unique_lock lock(mutex_); + if (!stopped_) + { + stopped_ = true; + signal_all(lock); + interrupt_reactor(); + } +} + +inline bool +reactor_scheduler_base::stopped() const noexcept +{ + std::unique_lock lock(mutex_); + return stopped_; +} + +inline void +reactor_scheduler_base::restart() +{ + std::unique_lock lock(mutex_); + stopped_ = false; +} + +inline std::size_t +reactor_scheduler_base::run() +{ + if (outstanding_work_.load(std::memory_order_acquire) == 0) + { + stop(); + return 0; + } + + reactor_thread_context_guard ctx(this); + std::unique_lock lock(mutex_); + + std::size_t n = 0; + for (;;) + { + if (!do_one(lock, -1, &ctx.frame_)) + break; + if (n != (std::numeric_limits::max)()) + ++n; + if (!lock.owns_lock()) + lock.lock(); + } + return n; +} + +inline std::size_t +reactor_scheduler_base::run_one() +{ + if (outstanding_work_.load(std::memory_order_acquire) == 0) + { + stop(); + return 0; + } + + reactor_thread_context_guard ctx(this); + std::unique_lock lock(mutex_); + return do_one(lock, -1, &ctx.frame_); +} + +inline std::size_t +reactor_scheduler_base::wait_one(long usec) +{ + if (outstanding_work_.load(std::memory_order_acquire) == 0) + { + stop(); + return 0; + } + + reactor_thread_context_guard ctx(this); + std::unique_lock lock(mutex_); + return do_one(lock, usec, &ctx.frame_); +} + +inline std::size_t +reactor_scheduler_base::poll() +{ + if 
(outstanding_work_.load(std::memory_order_acquire) == 0) + { + stop(); + return 0; + } + + reactor_thread_context_guard ctx(this); + std::unique_lock lock(mutex_); + + std::size_t n = 0; + for (;;) + { + if (!do_one(lock, 0, &ctx.frame_)) + break; + if (n != (std::numeric_limits::max)()) + ++n; + if (!lock.owns_lock()) + lock.lock(); + } + return n; +} + +inline std::size_t +reactor_scheduler_base::poll_one() +{ + if (outstanding_work_.load(std::memory_order_acquire) == 0) + { + stop(); + return 0; + } + + reactor_thread_context_guard ctx(this); + std::unique_lock lock(mutex_); + return do_one(lock, 0, &ctx.frame_); +} + +inline void +reactor_scheduler_base::work_started() noexcept +{ + outstanding_work_.fetch_add(1, std::memory_order_relaxed); +} + +inline void +reactor_scheduler_base::work_finished() noexcept +{ + if (outstanding_work_.fetch_sub(1, std::memory_order_acq_rel) == 1) + stop(); +} + +inline void +reactor_scheduler_base::compensating_work_started() const noexcept +{ + auto* ctx = reactor_find_context(this); + if (ctx) + ++ctx->private_outstanding_work; +} + +inline void +reactor_scheduler_base::drain_thread_queue( + op_queue& queue, std::int64_t count) const +{ + if (count > 0) + outstanding_work_.fetch_add(count, std::memory_order_relaxed); + + std::unique_lock lock(mutex_); + completed_ops_.splice(queue); + if (count > 0) + maybe_unlock_and_signal_one(lock); +} + +inline void +reactor_scheduler_base::post_deferred_completions(op_queue& ops) const +{ + if (ops.empty()) + return; + + if (auto* ctx = reactor_find_context(this)) + { + ctx->private_queue.splice(ops); + return; + } + + std::unique_lock lock(mutex_); + completed_ops_.splice(ops); + wake_one_thread_and_unlock(lock); +} + +inline void +reactor_scheduler_base::shutdown_drain() +{ + std::unique_lock lock(mutex_); + + while (auto* h = completed_ops_.pop()) + { + if (h == &task_op_) + continue; + lock.unlock(); + h->destroy(); + lock.lock(); + } + + signal_all(lock); +} + +inline void 
+reactor_scheduler_base::signal_all(std::unique_lock&) const +{ + state_ |= signaled_bit; + cond_.notify_all(); +} + +inline bool +reactor_scheduler_base::maybe_unlock_and_signal_one( + std::unique_lock& lock) const +{ + state_ |= signaled_bit; + if (state_ > signaled_bit) + { + lock.unlock(); + cond_.notify_one(); + return true; + } + return false; +} + +inline bool +reactor_scheduler_base::unlock_and_signal_one( + std::unique_lock& lock) const +{ + state_ |= signaled_bit; + bool have_waiters = state_ > signaled_bit; + lock.unlock(); + if (have_waiters) + cond_.notify_one(); + return have_waiters; +} + +inline void +reactor_scheduler_base::clear_signal() const +{ + state_ &= ~signaled_bit; +} + +inline void +reactor_scheduler_base::wait_for_signal( + std::unique_lock& lock) const +{ + while ((state_ & signaled_bit) == 0) + { + state_ += waiter_increment; + cond_.wait(lock); + state_ -= waiter_increment; + } +} + +inline void +reactor_scheduler_base::wait_for_signal_for( + std::unique_lock& lock, long timeout_us) const +{ + if ((state_ & signaled_bit) == 0) + { + state_ += waiter_increment; + cond_.wait_for(lock, std::chrono::microseconds(timeout_us)); + state_ -= waiter_increment; + } +} + +inline void +reactor_scheduler_base::wake_one_thread_and_unlock( + std::unique_lock& lock) const +{ + if (maybe_unlock_and_signal_one(lock)) + return; + + if (task_running_.load(std::memory_order_relaxed) && !task_interrupted_) + { + task_interrupted_ = true; + lock.unlock(); + interrupt_reactor(); + } + else + { + lock.unlock(); + } +} + +inline reactor_scheduler_base::work_cleanup::~work_cleanup() +{ + if (ctx) + { + std::int64_t produced = ctx->private_outstanding_work; + if (produced > 1) + sched->outstanding_work_.fetch_add( + produced - 1, std::memory_order_relaxed); + else if (produced < 1) + sched->work_finished(); + ctx->private_outstanding_work = 0; + + if (!ctx->private_queue.empty()) + { + lock->lock(); + sched->completed_ops_.splice(ctx->private_queue); + } + } + 
else + { + sched->work_finished(); + } +} + +inline reactor_scheduler_base::task_cleanup::~task_cleanup() +{ + if (!ctx) + return; + + if (ctx->private_outstanding_work > 0) + { + sched->outstanding_work_.fetch_add( + ctx->private_outstanding_work, std::memory_order_relaxed); + ctx->private_outstanding_work = 0; + } + + if (!ctx->private_queue.empty()) + { + if (!lock->owns_lock()) + lock->lock(); + sched->completed_ops_.splice(ctx->private_queue); + } +} + +inline std::size_t +reactor_scheduler_base::do_one( + std::unique_lock& lock, long timeout_us, context_type* ctx) +{ + for (;;) + { + if (stopped_) + return 0; + + scheduler_op* op = completed_ops_.pop(); + + // Handle reactor sentinel — time to poll for I/O + if (op == &task_op_) + { + bool more_handlers = + !completed_ops_.empty() || (ctx && !ctx->private_queue.empty()); + + if (!more_handlers && + (outstanding_work_.load(std::memory_order_acquire) == 0 || + timeout_us == 0)) + { + completed_ops_.push(&task_op_); + return 0; + } + + task_interrupted_ = more_handlers || timeout_us == 0; + task_running_.store(true, std::memory_order_release); + + if (more_handlers) + unlock_and_signal_one(lock); + + try + { + run_task(lock, ctx); + } + catch (...) 
+ { + task_running_.store(false, std::memory_order_relaxed); + throw; + } + + task_running_.store(false, std::memory_order_relaxed); + completed_ops_.push(&task_op_); + continue; + } + + // Handle operation + if (op != nullptr) + { + bool more = !completed_ops_.empty(); + + if (more) + ctx->unassisted = !unlock_and_signal_one(lock); + else + { + ctx->unassisted = false; + lock.unlock(); + } + + work_cleanup on_exit{this, &lock, ctx}; + (void)on_exit; + + (*op)(); + return 1; + } + + // Try private queue before blocking + if (reactor_drain_private_queue(ctx, outstanding_work_, completed_ops_)) + continue; + + if (outstanding_work_.load(std::memory_order_acquire) == 0 || + timeout_us == 0) + return 0; + + clear_signal(); + if (timeout_us < 0) + wait_for_signal(lock); + else + wait_for_signal_for(lock, timeout_us); + } +} + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SCHEDULER_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_service_state.hpp b/include/boost/corosio/native/detail/reactor/reactor_service_state.hpp new file mode 100644 index 00000000..c72b857a --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_service_state.hpp @@ -0,0 +1,53 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SERVICE_STATE_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SERVICE_STATE_HPP + +#include + +#include +#include +#include + +namespace boost::corosio::detail { + +/** Shared service state for reactor backends. + + Holds the scheduler reference, service mutex, and per-impl + ownership tracking. Used by both socket and acceptor services. + + @tparam Scheduler The backend's scheduler type. 
+ @tparam Impl The backend's socket or acceptor impl type. +*/ +template +struct reactor_service_state +{ + /// Construct with a reference to the owning scheduler. + explicit reactor_service_state(Scheduler& sched) noexcept + : sched_(sched) + { + } + + /// Reference to the owning scheduler. + Scheduler& sched_; + + /// Protects `impl_list_` and `impl_ptrs_`. + std::mutex mutex_; + + /// All live impl objects for shutdown traversal. + intrusive_list impl_list_; + + /// Shared ownership of each impl, keyed by raw pointer. + std::unordered_map> impl_ptrs_; +}; + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SERVICE_STATE_HPP diff --git a/include/boost/corosio/native/detail/reactor/reactor_socket.hpp b/include/boost/corosio/native/detail/reactor/reactor_socket.hpp new file mode 100644 index 00000000..1e947021 --- /dev/null +++ b/include/boost/corosio/native/detail/reactor/reactor_socket.hpp @@ -0,0 +1,725 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/corosio +// + +#ifndef BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SOCKET_HPP +#define BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SOCKET_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace boost::corosio::detail { + +/** CRTP base for reactor-backed socket implementations. + + Provides shared data members, trivial virtual overrides, + non-virtual helper methods for cancellation, registration, + close, and the full I/O dispatch logic (`do_connect`, + `do_read_some`, `do_write_some`). Concrete backends inherit + and add `cancel()`, `close_socket()`, and I/O overrides that + delegate to the `do_*` helpers. 
+ + @tparam Derived The concrete socket type (CRTP). + @tparam Service The backend's socket service type. + @tparam Op The backend's base op type. + @tparam ConnOp The backend's connect op type. + @tparam ReadOp The backend's read op type. + @tparam WriteOp The backend's write op type. + @tparam DescState The backend's descriptor_state type. +*/ +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +class reactor_socket + : public tcp_socket::implementation + , public std::enable_shared_from_this + , public intrusive_list::node +{ + friend Derived; + + explicit reactor_socket(Service& svc) noexcept : svc_(svc) {} + +protected: + Service& svc_; + int fd_ = -1; + endpoint local_endpoint_; + endpoint remote_endpoint_; + +public: + /// Pending connect operation slot. + ConnOp conn_; + + /// Pending read operation slot. + ReadOp rd_; + + /// Pending write operation slot. + WriteOp wr_; + + /// Per-descriptor state for persistent reactor registration. + DescState desc_state_; + + ~reactor_socket() override = default; + + /// Return the underlying file descriptor. + native_handle_type native_handle() const noexcept override + { + return fd_; + } + + /// Return the cached local endpoint. + endpoint local_endpoint() const noexcept override + { + return local_endpoint_; + } + + /// Return the cached remote endpoint. + endpoint remote_endpoint() const noexcept override + { + return remote_endpoint_; + } + + /// Return true if the socket has an open file descriptor. + bool is_open() const noexcept + { + return fd_ >= 0; + } + + /// Shut down part or all of the full-duplex connection. 
+ std::error_code shutdown(tcp_socket::shutdown_type what) noexcept override + { + int how; + switch (what) + { + case tcp_socket::shutdown_receive: + how = SHUT_RD; + break; + case tcp_socket::shutdown_send: + how = SHUT_WR; + break; + case tcp_socket::shutdown_both: + how = SHUT_RDWR; + break; + default: + return make_err(EINVAL); + } + if (::shutdown(fd_, how) != 0) + return make_err(errno); + return {}; + } + + /// Set a socket option. + std::error_code set_option( + int level, + int optname, + void const* data, + std::size_t size) noexcept override + { + if (::setsockopt( + fd_, level, optname, data, static_cast(size)) != 0) + return make_err(errno); + return {}; + } + + /// Get a socket option. + std::error_code + get_option(int level, int optname, void* data, std::size_t* size) + const noexcept override + { + socklen_t len = static_cast(*size); + if (::getsockopt(fd_, level, optname, data, &len) != 0) + return make_err(errno); + *size = static_cast(len); + return {}; + } + + /// Assign the file descriptor. + void set_socket(int fd) noexcept + { + fd_ = fd; + } + + /// Cache local and remote endpoints. + void set_endpoints(endpoint local, endpoint remote) noexcept + { + local_endpoint_ = local; + remote_endpoint_ = remote; + } + + /** Register an op with the reactor. + + Handles cached edge events and deferred cancellation. + Called on the EAGAIN/EINPROGRESS path when speculative + I/O failed. + */ + void register_op( + Op& op, + reactor_op_base*& desc_slot, + bool& ready_flag, + bool& cancel_flag) noexcept; + + /** Cancel a single pending operation. + + Claims the operation from its descriptor_state slot under + the mutex and posts it to the scheduler as cancelled. + + @param op The operation to cancel. + */ + void cancel_single_op(Op& op) noexcept; + + /** Cancel all pending operations. + + Invoked by the derived class's cancel() override. + */ + void do_cancel() noexcept; + + /** Close the socket and cancel pending operations. 
+ + Invoked by the derived class's close_socket(). The + derived class may add backend-specific cleanup after + calling this method. + */ + void do_close_socket() noexcept; + + /** Shared connect dispatch. + + Tries the connect syscall speculatively. On synchronous + completion, returns via inline budget or posts through queue. + On EINPROGRESS, registers with the reactor. + */ + std::coroutine_handle<> do_connect( + std::coroutine_handle<>, + capy::executor_ref, + endpoint, + std::stop_token const&, + std::error_code*); + + /** Shared scatter-read dispatch. + + Tries readv() speculatively. On success or hard error, + returns via inline budget or posts through queue. + On EAGAIN, registers with the reactor. + */ + std::coroutine_handle<> do_read_some( + std::coroutine_handle<>, + capy::executor_ref, + buffer_param, + std::stop_token const&, + std::error_code*, + std::size_t*); + + /** Shared gather-write dispatch. + + Tries the write via WriteOp::write_policy speculatively. + On success or hard error, returns via inline budget or + posts through queue. On EAGAIN, registers with the reactor. 
+ */ + std::coroutine_handle<> do_write_some( + std::coroutine_handle<>, + capy::executor_ref, + buffer_param, + std::stop_token const&, + std::error_code*, + std::size_t*); +}; + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +void +reactor_socket:: + register_op( + Op& op, + reactor_op_base*& desc_slot, + bool& ready_flag, + bool& cancel_flag) noexcept +{ + svc_.work_started(); + + std::lock_guard lock(desc_state_.mutex); + bool io_done = false; + if (ready_flag) + { + ready_flag = false; + op.perform_io(); + io_done = (op.errn != EAGAIN && op.errn != EWOULDBLOCK); + if (!io_done) + op.errn = 0; + } + + if (cancel_flag) + { + cancel_flag = false; + op.cancelled.store(true, std::memory_order_relaxed); + } + + if (io_done || op.cancelled.load(std::memory_order_acquire)) + { + svc_.post(&op); + svc_.work_finished(); + } + else + { + desc_slot = &op; + } +} + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +void +reactor_socket:: + cancel_single_op(Op& op) noexcept +{ + auto self = this->weak_from_this().lock(); + if (!self) + return; + + op.request_cancel(); + + reactor_op_base** desc_op_ptr = nullptr; + if (&op == &conn_) + desc_op_ptr = &desc_state_.connect_op; + else if (&op == &rd_) + desc_op_ptr = &desc_state_.read_op; + else if (&op == &wr_) + desc_op_ptr = &desc_state_.write_op; + + if (desc_op_ptr) + { + reactor_op_base* claimed = nullptr; + { + std::lock_guard lock(desc_state_.mutex); + if (*desc_op_ptr == &op) + claimed = std::exchange(*desc_op_ptr, nullptr); + else if (&op == &conn_) + desc_state_.connect_cancel_pending = true; + else if (&op == &rd_) + desc_state_.read_cancel_pending = true; + else if (&op == &wr_) + desc_state_.write_cancel_pending = true; + } + if (claimed) + { + op.impl_ptr = self; + svc_.post(&op); + svc_.work_finished(); + } + } +} + +template< + class Derived, + class Service, + 
class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +void +reactor_socket:: + do_cancel() noexcept +{ + auto self = this->weak_from_this().lock(); + if (!self) + return; + + conn_.request_cancel(); + rd_.request_cancel(); + wr_.request_cancel(); + + reactor_op_base* conn_claimed = nullptr; + reactor_op_base* rd_claimed = nullptr; + reactor_op_base* wr_claimed = nullptr; + { + std::lock_guard lock(desc_state_.mutex); + if (desc_state_.connect_op == &conn_) + conn_claimed = std::exchange(desc_state_.connect_op, nullptr); + if (desc_state_.read_op == &rd_) + rd_claimed = std::exchange(desc_state_.read_op, nullptr); + if (desc_state_.write_op == &wr_) + wr_claimed = std::exchange(desc_state_.write_op, nullptr); + } + + if (conn_claimed) + { + conn_.impl_ptr = self; + svc_.post(&conn_); + svc_.work_finished(); + } + if (rd_claimed) + { + rd_.impl_ptr = self; + svc_.post(&rd_); + svc_.work_finished(); + } + if (wr_claimed) + { + wr_.impl_ptr = self; + svc_.post(&wr_); + svc_.work_finished(); + } +} + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +void +reactor_socket:: + do_close_socket() noexcept +{ + auto self = this->weak_from_this().lock(); + if (self) + { + conn_.request_cancel(); + rd_.request_cancel(); + wr_.request_cancel(); + + reactor_op_base* conn_claimed = nullptr; + reactor_op_base* rd_claimed = nullptr; + reactor_op_base* wr_claimed = nullptr; + { + std::lock_guard lock(desc_state_.mutex); + conn_claimed = std::exchange(desc_state_.connect_op, nullptr); + rd_claimed = std::exchange(desc_state_.read_op, nullptr); + wr_claimed = std::exchange(desc_state_.write_op, nullptr); + desc_state_.read_ready = false; + desc_state_.write_ready = false; + desc_state_.read_cancel_pending = false; + desc_state_.write_cancel_pending = false; + desc_state_.connect_cancel_pending = false; + + // Keep impl alive while descriptor_state is queued in the + // scheduler. 
Must be under mutex to avoid racing with + // invoke_deferred_io()'s move of impl_ref_. + if (desc_state_.is_enqueued_.load(std::memory_order_acquire)) + desc_state_.impl_ref_ = self; + } + + if (conn_claimed) + { + conn_.impl_ptr = self; + svc_.post(&conn_); + svc_.work_finished(); + } + if (rd_claimed) + { + rd_.impl_ptr = self; + svc_.post(&rd_); + svc_.work_finished(); + } + if (wr_claimed) + { + wr_.impl_ptr = self; + svc_.post(&wr_); + svc_.work_finished(); + } + } + + if (fd_ >= 0) + { + if (desc_state_.registered_events != 0) + svc_.scheduler().deregister_descriptor(fd_); + ::close(fd_); + fd_ = -1; + } + + desc_state_.fd = -1; + desc_state_.registered_events = 0; + + local_endpoint_ = endpoint{}; + remote_endpoint_ = endpoint{}; +} + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +std::coroutine_handle<> +reactor_socket:: + do_connect( + std::coroutine_handle<> h, + capy::executor_ref ex, + endpoint ep, + std::stop_token const& token, + std::error_code* ec) +{ + auto& op = conn_; + + sockaddr_storage storage{}; + socklen_t addrlen = to_sockaddr(ep, socket_family(fd_), storage); + int result = ::connect(fd_, reinterpret_cast(&storage), addrlen); + + if (result == 0) + { + sockaddr_storage local_storage{}; + socklen_t local_len = sizeof(local_storage); + if (::getsockname( + fd_, reinterpret_cast(&local_storage), &local_len) == + 0) + local_endpoint_ = from_sockaddr(local_storage); + remote_endpoint_ = ep; + } + + if (result == 0 || errno != EINPROGRESS) + { + int err = (result < 0) ? errno : 0; + if (svc_.scheduler().try_consume_inline_budget()) + { + *ec = err ? 
make_err(err) : std::error_code{}; + return dispatch_coro(ex, h); + } + op.reset(); + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.fd = fd_; + op.target_endpoint = ep; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + op.complete(err, 0); + svc_.post(&op); + return std::noop_coroutine(); + } + + // EINPROGRESS — register with reactor + op.reset(); + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.fd = fd_; + op.target_endpoint = ep; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + + register_op( + op, desc_state_.connect_op, desc_state_.write_ready, + desc_state_.connect_cancel_pending); + return std::noop_coroutine(); +} + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +std::coroutine_handle<> +reactor_socket:: + do_read_some( + std::coroutine_handle<> h, + capy::executor_ref ex, + buffer_param param, + std::stop_token const& token, + std::error_code* ec, + std::size_t* bytes_out) +{ + auto& op = rd_; + op.reset(); + + capy::mutable_buffer bufs[ReadOp::max_buffers]; + op.iovec_count = static_cast(param.copy_to(bufs, ReadOp::max_buffers)); + + if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) + { + op.empty_buffer_read = true; + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + op.complete(0, 0); + svc_.post(&op); + return std::noop_coroutine(); + } + + for (int i = 0; i < op.iovec_count; ++i) + { + op.iovecs[i].iov_base = bufs[i].data(); + op.iovecs[i].iov_len = bufs[i].size(); + } + + // Speculative read + ssize_t n; + do + { + n = ::readv(fd_, op.iovecs, op.iovec_count); + } + while (n < 0 && errno == EINTR); + + if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) + { + int err = (n < 0) ? errno : 0; + auto bytes = (n > 0) ? 
static_cast(n) : std::size_t(0); + + if (svc_.scheduler().try_consume_inline_budget()) + { + if (err) + *ec = make_err(err); + else if (n == 0) + *ec = capy::error::eof; + else + *ec = {}; + *bytes_out = bytes; + return dispatch_coro(ex, h); + } + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + op.complete(err, bytes); + svc_.post(&op); + return std::noop_coroutine(); + } + + // EAGAIN — register with reactor + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.fd = fd_; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + + register_op( + op, desc_state_.read_op, desc_state_.read_ready, + desc_state_.read_cancel_pending); + return std::noop_coroutine(); +} + +template< + class Derived, + class Service, + class Op, + class ConnOp, + class ReadOp, + class WriteOp, + class DescState> +std::coroutine_handle<> +reactor_socket:: + do_write_some( + std::coroutine_handle<> h, + capy::executor_ref ex, + buffer_param param, + std::stop_token const& token, + std::error_code* ec, + std::size_t* bytes_out) +{ + auto& op = wr_; + op.reset(); + + capy::mutable_buffer bufs[WriteOp::max_buffers]; + op.iovec_count = + static_cast(param.copy_to(bufs, WriteOp::max_buffers)); + + if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) + { + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + op.complete(0, 0); + svc_.post(&op); + return std::noop_coroutine(); + } + + for (int i = 0; i < op.iovec_count; ++i) + { + op.iovecs[i].iov_base = bufs[i].data(); + op.iovecs[i].iov_len = bufs[i].size(); + } + + // Speculative write via backend-specific write policy + ssize_t n = WriteOp::write_policy::write(fd_, op.iovecs, op.iovec_count); + + if (n >= 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) + { + int err = (n < 0) ? 
errno : 0; + auto bytes = (n > 0) ? static_cast(n) : std::size_t(0); + + if (svc_.scheduler().try_consume_inline_budget()) + { + *ec = err ? make_err(err) : std::error_code{}; + *bytes_out = bytes; + return dispatch_coro(ex, h); + } + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + op.complete(err, bytes); + svc_.post(&op); + return std::noop_coroutine(); + } + + // EAGAIN — register with reactor + op.h = h; + op.ex = ex; + op.ec_out = ec; + op.bytes_out = bytes_out; + op.fd = fd_; + op.start(token, static_cast(this)); + op.impl_ptr = this->shared_from_this(); + + register_op( + op, desc_state_.write_op, desc_state_.write_ready, + desc_state_.write_cancel_pending); + return std::noop_coroutine(); +} + +} // namespace boost::corosio::detail + +#endif // BOOST_COROSIO_NATIVE_DETAIL_REACTOR_REACTOR_SOCKET_HPP diff --git a/include/boost/corosio/native/detail/select/select_acceptor.hpp b/include/boost/corosio/native/detail/select/select_acceptor.hpp index c4f74043..400a6f1d 100644 --- a/include/boost/corosio/native/detail/select/select_acceptor.hpp +++ b/include/boost/corosio/native/detail/select/select_acceptor.hpp @@ -14,24 +14,22 @@ #if BOOST_COROSIO_HAS_SELECT -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { class select_acceptor_service; -class select_socket_service; /// Acceptor implementation for select backend. 
class select_acceptor final - : public tcp_acceptor::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_acceptor< + select_acceptor, + select_acceptor_service, + select_op, + select_accept_op, + select_descriptor_state> { friend class select_acceptor_service; @@ -45,46 +43,8 @@ class select_acceptor final std::error_code*, io_object::implementation**) override; - int native_handle() const noexcept - { - return fd_; - } - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - bool is_open() const noexcept override - { - return fd_ >= 0; - } void cancel() noexcept override; - - std::error_code set_option( - int level, - int optname, - void const* data, - std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - void cancel_single_op(select_op& op) noexcept; void close_socket() noexcept; - void set_local_endpoint(endpoint ep) noexcept - { - local_endpoint_ = ep; - } - - select_acceptor_service& service() noexcept - { - return svc_; - } - - select_accept_op acc_; - -private: - select_acceptor_service& svc_; - int fd_ = -1; - endpoint local_endpoint_; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/select/select_acceptor_service.hpp b/include/boost/corosio/native/detail/select/select_acceptor_service.hpp index 4de3c87e..aff215a3 100644 --- a/include/boost/corosio/native/detail/select/select_acceptor_service.hpp +++ b/include/boost/corosio/native/detail/select/select_acceptor_service.hpp @@ -21,38 +21,26 @@ #include #include #include +#include -#include -#include -#include +#include + +#include +#include +#include #include #include #include +#include #include #include -#include -#include -#include - namespace boost::corosio::detail { -/** State for select acceptor service. 
*/ -class select_acceptor_state -{ -public: - explicit select_acceptor_state(select_scheduler& sched) noexcept - : sched_(sched) - { - } - - select_scheduler& sched_; - std::mutex mutex_; - intrusive_list acceptor_list_; - std::unordered_map> - acceptor_ptrs_; -}; +/// State for select acceptor service. +using select_acceptor_state = + reactor_service_state; /** select acceptor service implementation. @@ -87,7 +75,7 @@ class BOOST_COROSIO_DECL select_acceptor_service final : public acceptor_service { return state_->sched_; } - void post(select_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -111,107 +99,11 @@ select_accept_op::cancel() noexcept inline void select_accept_op::operator()() { - stop_cb.reset(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - } - - if (success && accepted_fd >= 0) - { - if (acceptor_impl_) - { - auto* socket_svc = static_cast(acceptor_impl_) - ->service() - .socket_service(); - if (socket_svc) - { - auto& impl = - static_cast(*socket_svc->construct()); - impl.set_socket(accepted_fd); - - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - sockaddr_storage remote_storage{}; - socklen_t remote_len = sizeof(remote_storage); - - endpoint local_ep, remote_ep; - if (::getsockname( - accepted_fd, - reinterpret_cast(&local_storage), - &local_len) == 0) - local_ep = from_sockaddr(local_storage); - if (::getpeername( - accepted_fd, - reinterpret_cast(&remote_storage), - &remote_len) == 0) - remote_ep = from_sockaddr(remote_storage); - - impl.set_endpoints(local_ep, remote_ep); - - if (impl_out) - *impl_out = &impl; - - accepted_fd = -1; - } - else - { - if (ec_out && !*ec_out) - *ec_out = make_err(ENOENT); - ::close(accepted_fd); - accepted_fd = -1; - if 
(impl_out) - *impl_out = nullptr; - } - } - else - { - ::close(accepted_fd); - accepted_fd = -1; - if (impl_out) - *impl_out = nullptr; - } - } - else - { - if (accepted_fd >= 0) - { - ::close(accepted_fd); - accepted_fd = -1; - } - - if (peer_impl) - { - auto* socket_svc_cleanup = - static_cast(acceptor_impl_) - ->service() - .socket_service(); - if (socket_svc_cleanup) - socket_svc_cleanup->destroy(peer_impl); - peer_impl = nullptr; - } - - if (impl_out) - *impl_out = nullptr; - } - - // Move to stack before destroying the frame - capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - impl_ptr.reset(); - dispatch_coro(saved_ex, saved_h).resume(); + complete_accept_op(*this); } inline select_acceptor::select_acceptor(select_acceptor_service& svc) noexcept - : svc_(svc) + : reactor_acceptor(svc) { } @@ -234,29 +126,30 @@ select_acceptor::accept( sockaddr_storage peer_storage{}; socklen_t addrlen = sizeof(peer_storage); - int accepted = - ::accept(fd_, reinterpret_cast(&peer_storage), &addrlen); + int accepted; + do + { + accepted = + ::accept(fd_, reinterpret_cast(&peer_storage), &addrlen); + } + while (accepted < 0 && errno == EINTR); if (accepted >= 0) { - // Reject fds that exceed select()'s FD_SETSIZE limit. if (accepted >= FD_SETSIZE) { ::close(accepted); - op.accepted_fd = -1; op.complete(EINVAL, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); return std::noop_coroutine(); } - // Set non-blocking and close-on-exec flags. 
int flags = ::fcntl(accepted, F_GETFL, 0); if (flags == -1) { int err = errno; ::close(accepted); - op.accepted_fd = -1; op.complete(err, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); @@ -267,7 +160,6 @@ select_acceptor::accept( { int err = errno; ::close(accepted); - op.accepted_fd = -1; op.complete(err, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); @@ -278,14 +170,55 @@ select_acceptor::accept( { int err = errno; ::close(accepted); - op.accepted_fd = -1; op.complete(err, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); return std::noop_coroutine(); } - op.accepted_fd = accepted; + { + std::lock_guard lock(desc_state_.mutex); + desc_state_.read_ready = false; + } + + if (svc_.scheduler().try_consume_inline_budget()) + { + auto* socket_svc = svc_.socket_service(); + if (socket_svc) + { + auto& impl = + static_cast(*socket_svc->construct()); + impl.set_socket(accepted); + + impl.desc_state_.fd = accepted; + { + std::lock_guard lock(impl.desc_state_.mutex); + impl.desc_state_.read_op = nullptr; + impl.desc_state_.write_op = nullptr; + impl.desc_state_.connect_op = nullptr; + } + socket_svc->scheduler().register_descriptor( + accepted, &impl.desc_state_); + + impl.set_endpoints( + local_endpoint_, from_sockaddr(peer_storage)); + + *ec = {}; + if (impl_out) + *impl_out = &impl; + } + else + { + ::close(accepted); + *ec = make_err(ENOENT); + if (impl_out) + *impl_out = nullptr; + } + return dispatch_coro(ex, h); + } + + op.accepted_fd = accepted; + op.peer_storage = peer_storage; op.complete(0, 0); op.impl_ptr = shared_from_this(); svc_.post(&op); @@ -294,42 +227,28 @@ select_acceptor::accept( if (errno == EAGAIN || errno == EWOULDBLOCK) { - svc_.work_started(); op.impl_ptr = shared_from_this(); + svc_.work_started(); - // Set registering BEFORE register_fd to close the race window where - // reactor sees an event before we set registered. 
- op.registered.store( - select_registration_state::registering, std::memory_order_release); - svc_.scheduler().register_fd(fd_, &op, select_scheduler::event_read); - - // Transition to registered. If this fails, reactor or cancel already - // claimed the op (state is now unregistered), so we're done. However, - // we must still deregister the fd because cancel's deregister_fd may - // have run before our register_fd, leaving the fd orphaned. - auto expected = select_registration_state::registering; - if (!op.registered.compare_exchange_strong( - expected, select_registration_state::registered, - std::memory_order_acq_rel)) + std::lock_guard lock(desc_state_.mutex); + bool io_done = false; + if (desc_state_.read_ready) { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - return std::noop_coroutine(); + desc_state_.read_ready = false; + op.perform_io(); + io_done = (op.errn != EAGAIN && op.errn != EWOULDBLOCK); + if (!io_done) + op.errn = 0; } - // If cancelled was set before we registered, handle it now. 
- if (op.cancelled.load(std::memory_order_acquire)) + if (io_done || op.cancelled.load(std::memory_order_acquire)) { - auto prev = op.registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd( - fd_, select_scheduler::event_read); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - svc_.work_finished(); - } + svc_.post(&op); + svc_.work_finished(); + } + else + { + desc_state_.read_op = &op; } return std::noop_coroutine(); } @@ -343,71 +262,13 @@ select_acceptor::accept( inline void select_acceptor::cancel() noexcept { - auto self = weak_from_this().lock(); - if (!self) - return; - - auto prev = acc_.registered.exchange( - select_registration_state::unregistered, std::memory_order_acq_rel); - acc_.request_cancel(); - - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - acc_.impl_ptr = self; - svc_.post(&acc_); - svc_.work_finished(); - } -} - -inline void -select_acceptor::cancel_single_op(select_op& op) noexcept -{ - auto self = weak_from_this().lock(); - if (!self) - return; - - auto prev = op.registered.exchange( - select_registration_state::unregistered, std::memory_order_acq_rel); - op.request_cancel(); - - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } + do_cancel(); } inline void select_acceptor::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - auto prev = acc_.registered.exchange( - select_registration_state::unregistered, std::memory_order_acq_rel); - acc_.request_cancel(); - - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - acc_.impl_ptr = self; - svc_.post(&acc_); - svc_.work_finished(); - } 
- } - - if (fd_ >= 0) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - ::close(fd_); - fd_ = -1; - } - - local_endpoint_ = endpoint{}; + do_close_socket(); } inline select_acceptor_service::select_acceptor_service( @@ -426,10 +287,10 @@ select_acceptor_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->acceptor_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) impl->close_socket(); - // Don't clear acceptor_ptrs_ here — same rationale as + // Don't clear impl_ptrs_ here — same rationale as // select_socket_service::shutdown(). Let ~state_ release ptrs // after scheduler shutdown has drained all queued ops. } @@ -441,8 +302,8 @@ select_acceptor_service::construct() auto* raw = impl.get(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.push_back(raw); - state_->acceptor_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); return raw; } @@ -453,8 +314,8 @@ select_acceptor_service::destroy(io_object::implementation* impl) auto* select_impl = static_cast(impl); select_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->acceptor_list_.remove(select_impl); - state_->acceptor_ptrs_.erase(select_impl); + state_->impl_list_.remove(select_impl); + state_->impl_ptrs_.erase(select_impl); } inline void @@ -463,27 +324,6 @@ select_acceptor_service::close(io_object::handle& h) static_cast(h.get())->close_socket(); } -inline std::error_code -select_acceptor::set_option( - int level, int optname, void const* data, std::size_t size) noexcept -{ - if (::setsockopt(fd_, level, optname, data, static_cast(size)) != - 0) - return make_err(errno); - return {}; -} - -inline std::error_code -select_acceptor::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return 
make_err(errno); - *size = static_cast(len); - return {}; -} - inline std::error_code select_acceptor_service::open_acceptor_socket( tcp_acceptor::implementation& impl, int family, int type, int protocol) @@ -495,7 +335,6 @@ select_acceptor_service::open_acceptor_socket( if (fd < 0) return make_err(errno); - // Set non-blocking and close-on-exec int flags = ::fcntl(fd, F_GETFL, 0); if (flags == -1) { @@ -528,7 +367,23 @@ select_acceptor_service::open_acceptor_socket( ::setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof(val)); } +#ifdef SO_NOSIGPIPE + { + int nosig = 1; + ::setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &nosig, sizeof(nosig)); + } +#endif + select_impl->fd_ = fd; + + // Set up descriptor state but do NOT register with reactor yet + // (registration happens in do_listen via reactor_acceptor base) + select_impl->desc_state_.fd = fd; + { + std::lock_guard lock(select_impl->desc_state_.mutex); + select_impl->desc_state_.read_op = nullptr; + } + return {}; } @@ -536,38 +391,18 @@ inline std::error_code select_acceptor_service::bind_acceptor( tcp_acceptor::implementation& impl, endpoint ep) { - auto* select_impl = static_cast(&impl); - int fd = select_impl->fd_; - - sockaddr_storage storage{}; - socklen_t addrlen = detail::to_sockaddr(ep, storage); - if (::bind(fd, reinterpret_cast(&storage), addrlen) < 0) - return make_err(errno); - - // Cache local endpoint (resolves ephemeral port) - sockaddr_storage local{}; - socklen_t local_len = sizeof(local); - if (::getsockname(fd, reinterpret_cast(&local), &local_len) == 0) - select_impl->set_local_endpoint(detail::from_sockaddr(local)); - - return {}; + return static_cast(&impl)->do_bind(ep); } inline std::error_code select_acceptor_service::listen_acceptor( tcp_acceptor::implementation& impl, int backlog) { - auto* select_impl = static_cast(&impl); - int fd = select_impl->fd_; - - if (::listen(fd, backlog) < 0) - return make_err(errno); - - return {}; + return static_cast(&impl)->do_listen(backlog); } inline 
void -select_acceptor_service::post(select_op* op) +select_acceptor_service::post(scheduler_op* op) { state_->sched_.post(op); } diff --git a/include/boost/corosio/native/detail/select/select_op.hpp b/include/boost/corosio/native/detail/select/select_op.hpp index b9e9912f..677d9105 100644 --- a/include/boost/corosio/native/detail/select/select_op.hpp +++ b/include/boost/corosio/native/detail/select/select_op.hpp @@ -14,385 +14,171 @@ #if BOOST_COROSIO_HAS_SELECT -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include +#include +#include -#include #include #include - -#include -#include -#include -#include -#include - -#include -#include #include -#include +#include /* - select Operation State - ====================== - - Each async I/O operation has a corresponding select_op-derived struct that - holds the operation's state while it's in flight. The socket impl owns - fixed slots for each operation type (conn_, rd_, wr_), so only one - operation of each type can be pending per socket at a time. - - This mirrors the epoll_op design for consistency across backends. - - Completion vs Cancellation Race - ------------------------------- - The `registered` atomic uses a tri-state (unregistered, registering, - registered) to handle two races: (1) between register_fd() and the - reactor seeing an event, and (2) between reactor completion and cancel(). - - The registering state closes the window where an event could arrive - after register_fd() but before the boolean was set. The reactor and - cancel() both treat registering the same as registered when claiming. - - Whoever atomically exchanges to unregistered "claims" the operation - and is responsible for completing it. The loser sees unregistered and - does nothing. The initiating thread uses compare_exchange to transition - from registering to registered; if this fails, the reactor or cancel - already claimed the op. 
- - Impl Lifetime Management - ------------------------ - When cancel() posts an op to the scheduler's ready queue, the socket impl - might be destroyed before the scheduler processes the op. The `impl_ptr` - member holds a shared_ptr to the impl, keeping it alive until the op - completes. - - EOF Detection - ------------- - For reads, 0 bytes with no error means EOF. But an empty user buffer also - returns 0 bytes. The `empty_buffer_read` flag distinguishes these cases. - - SIGPIPE Prevention - ------------------ - Writes use sendmsg() with MSG_NOSIGNAL instead of writev() to prevent - SIGPIPE when the peer has closed. + File descriptors are registered with the select scheduler once (via + select_descriptor_state) and stay registered until closed. + + select() is level-triggered but the descriptor_state pattern + (designed for edge-triggered) works correctly: is_enqueued_ CAS + prevents double-enqueue, add_ready_events is idempotent, and + EAGAIN ops stay parked until the next select() re-reports readiness. + + cancel() captures shared_from_this() into op.impl_ptr to prevent + use-after-free when the socket is closed with pending ops. + + Writes use sendmsg(MSG_NOSIGNAL) on Linux. On macOS/BSD where + MSG_NOSIGNAL may be absent, SO_NOSIGPIPE is set at socket creation + and accepted-socket setup instead. */ namespace boost::corosio::detail { -// Forward declarations for cancellation support +// Forward declarations class select_socket; class select_acceptor; +struct select_op; -/** Registration state for async operations. +// Forward declaration +class select_scheduler; - Tri-state enum to handle the race between register_fd() and - run_reactor() seeing an event. Setting REGISTERING before - calling register_fd() ensures events delivered during the - registration window are not dropped. 
-*/ -enum class select_registration_state : std::uint8_t -{ - unregistered, ///< Not registered with reactor - registering, ///< register_fd() called, not yet confirmed - registered ///< Fully registered, ready for events -}; +/// Per-descriptor state for persistent select registration. +struct select_descriptor_state final : reactor_descriptor_state +{}; -struct select_op : scheduler_op +/// select base operation — thin wrapper over reactor_op. +struct select_op : reactor_op { - struct canceller - { - select_op* op; - void operator()() const noexcept; - }; - - std::coroutine_handle<> h; - capy::executor_ref ex; - std::error_code* ec_out = nullptr; - std::size_t* bytes_out = nullptr; - - int fd = -1; - int errn = 0; - std::size_t bytes_transferred = 0; - - std::atomic cancelled{false}; - std::atomic registered{ - select_registration_state::unregistered}; - std::optional> stop_cb; - - // Prevents use-after-free when socket is closed with pending ops. - std::shared_ptr impl_ptr; - - // For stop_token cancellation - pointer to owning socket/acceptor impl. 
- select_socket* socket_impl_ = nullptr; - select_acceptor* acceptor_impl_ = nullptr; - - select_op() = default; - - void reset() noexcept - { - fd = -1; - errn = 0; - bytes_transferred = 0; - cancelled.store(false, std::memory_order_relaxed); - registered.store( - select_registration_state::unregistered, std::memory_order_relaxed); - impl_ptr.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = nullptr; - } - - void operator()() override - { - stop_cb.reset(); - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else if (is_read_operation() && bytes_transferred == 0) - *ec_out = capy::error::eof; - else - *ec_out = {}; - } - - if (bytes_out) - *bytes_out = bytes_transferred; - - // Move to stack before destroying the frame - capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - impl_ptr.reset(); - dispatch_coro(saved_ex, saved_h).resume(); - } - - virtual bool is_read_operation() const noexcept - { - return false; - } - virtual void cancel() noexcept = 0; - - void destroy() override - { - stop_cb.reset(); - impl_ptr.reset(); - } - - void request_cancel() noexcept - { - cancelled.store(true, std::memory_order_release); - } - - void start(std::stop_token const& token) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = nullptr; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void start(std::stop_token const& token, select_socket* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = impl; - acceptor_impl_ = nullptr; - - if (token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void start(std::stop_token const& token, select_acceptor* impl) - { - cancelled.store(false, std::memory_order_release); - stop_cb.reset(); - socket_impl_ = nullptr; - acceptor_impl_ = impl; - - if 
(token.stop_possible()) - stop_cb.emplace(token, canceller{this}); - } - - void complete(int err, std::size_t bytes) noexcept - { - errn = err; - bytes_transferred = bytes; - } - - virtual void perform_io() noexcept {} + void operator()() override; }; -struct select_connect_op final : select_op +/// select connect operation. +struct select_connect_op final : reactor_connect_op { - endpoint target_endpoint; - - void reset() noexcept - { - select_op::reset(); - target_endpoint = endpoint{}; - } - - void perform_io() noexcept override - { - // connect() completion status is retrieved via SO_ERROR, not return value - int err = 0; - socklen_t len = sizeof(err); - if (::getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0) - err = errno; - complete(err, 0); - } - - // Defined in sockets.cpp where select_socket is complete void operator()() override; void cancel() noexcept override; }; -struct select_read_op final : select_op +/// select scatter-read operation. +struct select_read_op final : reactor_read_op { - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - bool empty_buffer_read = false; - - bool is_read_operation() const noexcept override - { - return !empty_buffer_read; - } - - void reset() noexcept - { - select_op::reset(); - iovec_count = 0; - empty_buffer_read = false; - } - - void perform_io() noexcept override - { - ssize_t n = ::readv(fd, iovecs, iovec_count); - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); - } - void cancel() noexcept override; }; -struct select_write_op final : select_op -{ - static constexpr std::size_t max_buffers = 16; - iovec iovecs[max_buffers]; - int iovec_count = 0; - - void reset() noexcept - { - select_op::reset(); - iovec_count = 0; - } +/** Provides sendmsg() with EINTR retry for select writes. - void perform_io() noexcept override + Uses MSG_NOSIGNAL where available (Linux). 
On platforms without + it (macOS/BSD), SO_NOSIGPIPE is set at socket creation time + and flags=0 is used here. +*/ +struct select_write_policy +{ + static ssize_t write(int fd, iovec* iovecs, int count) noexcept { msghdr msg{}; msg.msg_iov = iovecs; - msg.msg_iovlen = static_cast(iovec_count); + msg.msg_iovlen = static_cast(count); + +#ifdef MSG_NOSIGNAL + constexpr int send_flags = MSG_NOSIGNAL; +#else + constexpr int send_flags = 0; +#endif - ssize_t n = ::sendmsg(fd, &msg, MSG_NOSIGNAL); - if (n >= 0) - complete(0, static_cast(n)); - else - complete(errno, 0); + ssize_t n; + do + { + n = ::sendmsg(fd, &msg, send_flags); + } + while (n < 0 && errno == EINTR); + return n; } +}; +/// select gather-write operation. +struct select_write_op final : reactor_write_op +{ void cancel() noexcept override; }; -struct select_accept_op final : select_op -{ - int accepted_fd = -1; - io_object::implementation* peer_impl = nullptr; - io_object::implementation** impl_out = nullptr; +/** Provides accept() + fcntl(O_NONBLOCK|FD_CLOEXEC) with FD_SETSIZE check. - void reset() noexcept + Uses accept() instead of accept4() for broader POSIX compatibility. 
+*/ +struct select_accept_policy +{ + static int do_accept(int fd, sockaddr_storage& peer) noexcept { - select_op::reset(); - accepted_fd = -1; - peer_impl = nullptr; - impl_out = nullptr; - } + socklen_t addrlen = sizeof(peer); + int new_fd; + do + { + new_fd = ::accept(fd, reinterpret_cast(&peer), &addrlen); + } + while (new_fd < 0 && errno == EINTR); - void perform_io() noexcept override - { - sockaddr_storage addr_storage{}; - socklen_t addrlen = sizeof(addr_storage); + if (new_fd < 0) + return new_fd; - // Note: select backend uses accept() + fcntl instead of accept4() - // for broader POSIX compatibility - int new_fd = - ::accept(fd, reinterpret_cast(&addr_storage), &addrlen); + if (new_fd >= FD_SETSIZE) + { + ::close(new_fd); + errno = EINVAL; + return -1; + } + + int flags = ::fcntl(new_fd, F_GETFL, 0); + if (flags == -1) + { + int err = errno; + ::close(new_fd); + errno = err; + return -1; + } + + if (::fcntl(new_fd, F_SETFL, flags | O_NONBLOCK) == -1) + { + int err = errno; + ::close(new_fd); + errno = err; + return -1; + } - if (new_fd >= 0) + if (::fcntl(new_fd, F_SETFD, FD_CLOEXEC) == -1) { - // Reject fds that exceed select()'s FD_SETSIZE limit. - // Better to fail now than during later async operations. - if (new_fd >= FD_SETSIZE) - { - ::close(new_fd); - complete(EINVAL, 0); - return; - } - - // Set non-blocking and close-on-exec flags. - // A non-blocking socket is essential for the async reactor; - // if we can't configure it, fail rather than risk blocking. 
- int flags = ::fcntl(new_fd, F_GETFL, 0); - if (flags == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - if (::fcntl(new_fd, F_SETFL, flags | O_NONBLOCK) == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - if (::fcntl(new_fd, F_SETFD, FD_CLOEXEC) == -1) - { - int err = errno; - ::close(new_fd); - complete(err, 0); - return; - } - - accepted_fd = new_fd; - complete(0, 0); + int err = errno; + ::close(new_fd); + errno = err; + return -1; } - else + +#ifdef SO_NOSIGPIPE + int one = 1; + if (::setsockopt( + new_fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == -1) { - complete(errno, 0); + int err = errno; + ::close(new_fd); + errno = err; + return -1; } +#endif + + return new_fd; } +}; - // Defined in acceptors.cpp where select_acceptor is complete +/// select accept operation. +struct select_accept_op final + : reactor_accept_op +{ void operator()() override; void cancel() noexcept override; }; diff --git a/include/boost/corosio/native/detail/select/select_scheduler.hpp b/include/boost/corosio/native/detail/select/select_scheduler.hpp index 14e2ef7d..1b5cf15b 100644 --- a/include/boost/corosio/native/detail/select/select_scheduler.hpp +++ b/include/boost/corosio/native/detail/select/select_scheduler.hpp @@ -17,8 +17,7 @@ #include #include -#include -#include +#include #include #include @@ -27,19 +26,15 @@ #include #include -#include #include -#include #include #include #include -#include #include #include -#include -#include +#include #include #include #include @@ -47,21 +42,18 @@ namespace boost::corosio::detail { struct select_op; +struct select_descriptor_state; /** POSIX scheduler using select() for I/O multiplexing. This scheduler implements the scheduler interface using the POSIX select() - call for I/O event notification. It uses a single reactor model - where one thread runs select() while other threads wait on a condition - variable for handler work. 
This design provides: - - - Handler parallelism: N posted handlers can execute on N threads - - No thundering herd: condition_variable wakes exactly one thread - - Portability: Works on all POSIX systems + call for I/O event notification. It inherits the shared reactor threading + model from reactor_scheduler_base: signal state machine, inline completion + budget, work counting, and the do_one event loop. The design mirrors epoll_scheduler for behavioral consistency: - Same single-reactor thread coordination model - - Same work counting semantics + - Same deferred I/O pattern (reactor marks ready; workers do I/O) - Same timer integration pattern Known Limitations: @@ -72,13 +64,9 @@ struct select_op; @par Thread Safety All public member functions are thread-safe. */ -class BOOST_COROSIO_DECL select_scheduler final - : public native_scheduler - , public capy::execution_context::service +class BOOST_COROSIO_DECL select_scheduler final : public reactor_scheduler_base { public: - using key_type = scheduler; - /** Construct the scheduler. Creates a self-pipe for reactor interruption. @@ -88,23 +76,14 @@ class BOOST_COROSIO_DECL select_scheduler final */ select_scheduler(capy::execution_context& ctx, int concurrency_hint = -1); + /// Destroy the scheduler. ~select_scheduler() override; select_scheduler(select_scheduler const&) = delete; select_scheduler& operator=(select_scheduler const&) = delete; + /// Shut down the scheduler, draining pending operations. void shutdown() override; - void post(std::coroutine_handle<> h) const override; - void post(scheduler_op* h) const override; - bool running_in_this_thread() const noexcept override; - void stop() override; - bool stopped() const noexcept override; - void restart() override; - std::size_t run() override; - std::size_t run_one() override; - std::size_t wait_one(long usec) override; - std::size_t poll() override; - std::size_t poll_one() override; /** Return the maximum file descriptor value supported. 
@@ -119,155 +98,52 @@ class BOOST_COROSIO_DECL select_scheduler final return FD_SETSIZE - 1; } - /** Register a file descriptor for monitoring. + /** Register a descriptor for persistent monitoring. + + The fd is added to the registered_descs_ map and will be + included in subsequent select() calls. The reactor is + interrupted so a blocked select() rebuilds its fd_sets. @param fd The file descriptor to register. - @param op The operation associated with this fd. - @param events Event mask: 1 = read, 2 = write, 3 = both. + @param desc Pointer to descriptor state for this fd. */ - void register_fd(int fd, select_op* op, int events) const; + void register_descriptor(int fd, select_descriptor_state* desc) const; - /** Unregister a file descriptor from monitoring. + /** Deregister a persistently registered descriptor. - @param fd The file descriptor to unregister. - @param events Event mask to remove: 1 = read, 2 = write, 3 = both. + @param fd The file descriptor to deregister. */ - void deregister_fd(int fd, int events) const; + void deregister_descriptor(int fd) const; - void work_started() noexcept override; - void work_finished() noexcept override; + /** Interrupt the reactor so it rebuilds its fd_sets. - // Event flags for register_fd/deregister_fd - static constexpr int event_read = 1; - static constexpr int event_write = 2; + Called when a write or connect op is registered after + the reactor's snapshot was taken. Without this, select() + may block not watching for writability on the fd. 
+ */ + void notify_reactor() const; private: - std::size_t do_one(long timeout_us); - void run_reactor(std::unique_lock& lock); - void wake_one_thread_and_unlock(std::unique_lock& lock) const; - void interrupt_reactor() const; + void + run_task(std::unique_lock& lock, context_type* ctx) override; + void interrupt_reactor() const override; long calculate_timeout(long requested_timeout_us) const; // Self-pipe for interrupting select() int pipe_fds_[2]; // [0]=read, [1]=write - mutable std::mutex mutex_; - mutable std::condition_variable wakeup_event_; - mutable op_queue completed_ops_; - mutable std::atomic outstanding_work_; - std::atomic stopped_; - - // Per-fd state for tracking registered operations - struct fd_state - { - select_op* read_op = nullptr; - select_op* write_op = nullptr; - }; - mutable std::unordered_map registered_fds_; + // Per-fd tracking for fd_set building + mutable std::unordered_map registered_descs_; mutable int max_fd_ = -1; - - // Single reactor thread coordination - mutable bool reactor_running_ = false; - mutable bool reactor_interrupted_ = false; - mutable int idle_thread_count_ = 0; - - // Sentinel operation for interleaving reactor runs with handler execution. - // Ensures the reactor runs periodically even when handlers are continuously - // posted, preventing timer starvation. - struct task_op final : scheduler_op - { - void operator()() override {} - void destroy() override {} - }; - task_op task_op_; }; -/* - select Scheduler - Single Reactor Model - ======================================= - - This scheduler mirrors the epoll_scheduler design but uses select() instead - of epoll for I/O multiplexing. The thread coordination strategy is identical: - one thread becomes the "reactor" while others wait on a condition variable. 
- - Thread Model - ------------ - - ONE thread runs select() at a time (the reactor thread) - - OTHER threads wait on wakeup_event_ (condition variable) for handlers - - When work is posted, exactly one waiting thread wakes via notify_one() - - Key Differences from epoll - -------------------------- - - Uses self-pipe instead of eventfd for interruption (more portable) - - fd_set rebuilding each iteration (O(n) vs O(1) for epoll) - - FD_SETSIZE limit (~1024 fds on most systems) - - Level-triggered only (no edge-triggered mode) - - Self-Pipe Pattern - ----------------- - To interrupt a blocking select() call (e.g., when work is posted or a timer - expires), we write a byte to pipe_fds_[1]. The read end pipe_fds_[0] is - always in the read_fds set, so select() returns immediately. We drain the - pipe to clear the readable state. - - fd-to-op Mapping - ---------------- - We use an unordered_map to track which operations are - registered for each fd. This allows O(1) lookup when select() returns - ready fds. Each fd can have at most one read op and one write op registered. 
-*/ - -namespace select { - -struct BOOST_COROSIO_SYMBOL_VISIBLE scheduler_context -{ - select_scheduler const* key; - scheduler_context* next; -}; - -inline thread_local_ptr context_stack; - -struct thread_context_guard -{ - scheduler_context frame_; - - explicit thread_context_guard(select_scheduler const* ctx) noexcept - : frame_{ctx, context_stack.get()} - { - context_stack.set(&frame_); - } - - ~thread_context_guard() noexcept - { - context_stack.set(frame_.next); - } -}; - -struct work_guard -{ - select_scheduler* self; - ~work_guard() - { - self->work_finished(); - } -}; - -} // namespace select - inline select_scheduler::select_scheduler(capy::execution_context& ctx, int) : pipe_fds_{-1, -1} - , outstanding_work_(0) - , stopped_(false) , max_fd_(-1) - , reactor_running_(false) - , reactor_interrupted_(false) - , idle_thread_count_(0) { - // Create self-pipe for interrupting select() if (::pipe(pipe_fds_) < 0) detail::throw_system_error(make_err(errno), "pipe"); - // Set both ends to non-blocking and close-on-exec for (int i = 0; i < 2; ++i) { int flags = ::fcntl(pipe_fds_[i], F_GETFL, 0); @@ -300,13 +176,9 @@ inline select_scheduler::select_scheduler(capy::execution_context& ctx, int) static_cast(p)->interrupt_reactor(); })); - // Initialize resolver service get_resolver_service(ctx, *this); - - // Initialize signal service get_signal_service(ctx, *this); - // Push task sentinel to interleave reactor runs with handler execution completed_ops_.push(&task_op_); } @@ -321,265 +193,67 @@ inline select_scheduler::~select_scheduler() inline void select_scheduler::shutdown() { - { - std::unique_lock lock(mutex_); - - while (auto* h = completed_ops_.pop()) - { - if (h == &task_op_) - continue; - lock.unlock(); - h->destroy(); - lock.lock(); - } - } + shutdown_drain(); if (pipe_fds_[1] >= 0) interrupt_reactor(); - - wakeup_event_.notify_all(); } inline void -select_scheduler::post(std::coroutine_handle<> h) const +select_scheduler::register_descriptor( + int fd, 
select_descriptor_state* desc) const { - struct post_handler final : scheduler_op - { - std::coroutine_handle<> h_; - - explicit post_handler(std::coroutine_handle<> h) : h_(h) {} - - ~post_handler() override = default; - - void operator()() override - { - auto h = h_; - delete this; - h.resume(); - } - - void destroy() override - { - auto h = h_; - delete this; - h.destroy(); - } - }; - - auto ph = std::make_unique(h); - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(ph.release()); - wake_one_thread_and_unlock(lock); -} - -inline void -select_scheduler::post(scheduler_op* h) const -{ - outstanding_work_.fetch_add(1, std::memory_order_relaxed); - - std::unique_lock lock(mutex_); - completed_ops_.push(h); - wake_one_thread_and_unlock(lock); -} - -inline bool -select_scheduler::running_in_this_thread() const noexcept -{ - for (auto* c = select::context_stack.get(); c != nullptr; c = c->next) - if (c->key == this) - return true; - return false; -} - -inline void -select_scheduler::stop() -{ - bool expected = false; - if (stopped_.compare_exchange_strong( - expected, true, std::memory_order_release, - std::memory_order_relaxed)) - { - // Wake all threads so they notice stopped_ and exit - { - std::lock_guard lock(mutex_); - wakeup_event_.notify_all(); - } - interrupt_reactor(); - } -} - -inline bool -select_scheduler::stopped() const noexcept -{ - return stopped_.load(std::memory_order_acquire); -} - -inline void -select_scheduler::restart() -{ - stopped_.store(false, std::memory_order_release); -} - -inline std::size_t -select_scheduler::run() -{ - if (stopped_.load(std::memory_order_acquire)) - return 0; - - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - select::thread_context_guard ctx(this); - - std::size_t n = 0; - while (do_one(-1)) - if (n != (std::numeric_limits::max)()) - ++n; - return n; -} - -inline std::size_t -select_scheduler::run_one() -{ - 
if (stopped_.load(std::memory_order_acquire)) - return 0; - - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - select::thread_context_guard ctx(this); - return do_one(-1); -} - -inline std::size_t -select_scheduler::wait_one(long usec) -{ - if (stopped_.load(std::memory_order_acquire)) - return 0; - - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - select::thread_context_guard ctx(this); - return do_one(usec); -} - -inline std::size_t -select_scheduler::poll() -{ - if (stopped_.load(std::memory_order_acquire)) - return 0; - - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - stop(); - return 0; - } - - select::thread_context_guard ctx(this); - - std::size_t n = 0; - while (do_one(0)) - if (n != (std::numeric_limits::max)()) - ++n; - return n; -} + if (fd < 0 || fd >= FD_SETSIZE) + detail::throw_system_error(make_err(EINVAL), "select: fd out of range"); -inline std::size_t -select_scheduler::poll_one() -{ - if (stopped_.load(std::memory_order_acquire)) - return 0; + desc->registered_events = reactor_event_read | reactor_event_write; + desc->fd = fd; + desc->scheduler_ = this; + desc->ready_events_.store(0, std::memory_order_relaxed); - if (outstanding_work_.load(std::memory_order_acquire) == 0) { - stop(); - return 0; + std::lock_guard lock(desc->mutex); + desc->impl_ref_.reset(); + desc->read_ready = false; + desc->write_ready = false; } - select::thread_context_guard ctx(this); - return do_one(0); -} - -inline void -select_scheduler::register_fd(int fd, select_op* op, int events) const -{ - // Validate fd is within select() limits - if (fd < 0 || fd >= FD_SETSIZE) - detail::throw_system_error(make_err(EINVAL), "select: fd out of range"); - { std::lock_guard lock(mutex_); - - auto& state = registered_fds_[fd]; - if (events & event_read) - state.read_op = op; - if (events & event_write) - state.write_op = op; - + registered_descs_[fd] = desc; if (fd > max_fd_) 
max_fd_ = fd; } - // Wake the reactor so a thread blocked in select() rebuilds its fd_sets - // with the newly registered fd. interrupt_reactor(); } inline void -select_scheduler::deregister_fd(int fd, int events) const +select_scheduler::deregister_descriptor(int fd) const { std::lock_guard lock(mutex_); - auto it = registered_fds_.find(fd); - if (it == registered_fds_.end()) + auto it = registered_descs_.find(fd); + if (it == registered_descs_.end()) return; - if (events & event_read) - it->second.read_op = nullptr; - if (events & event_write) - it->second.write_op = nullptr; + registered_descs_.erase(it); - // Remove entry if both are null - if (!it->second.read_op && !it->second.write_op) + if (fd == max_fd_) { - registered_fds_.erase(it); - - // Recalculate max_fd_ if needed - if (fd == max_fd_) + max_fd_ = pipe_fds_[0]; + for (auto& [registered_fd, state] : registered_descs_) { - max_fd_ = pipe_fds_[0]; // At minimum, the pipe read end - for (auto& [registered_fd, state] : registered_fds_) - { - if (registered_fd > max_fd_) - max_fd_ = registered_fd; - } + if (registered_fd > max_fd_) + max_fd_ = registered_fd; } } } inline void -select_scheduler::work_started() noexcept +select_scheduler::notify_reactor() const { - outstanding_work_.fetch_add(1, std::memory_order_relaxed); -} - -inline void -select_scheduler::work_finished() noexcept -{ - if (outstanding_work_.fetch_sub(1, std::memory_order_acq_rel) == 1) - stop(); + interrupt_reactor(); } inline void @@ -589,30 +263,6 @@ select_scheduler::interrupt_reactor() const [[maybe_unused]] auto r = ::write(pipe_fds_[1], &byte, 1); } -inline void -select_scheduler::wake_one_thread_and_unlock( - std::unique_lock& lock) const -{ - if (idle_thread_count_ > 0) - { - // Idle worker exists - wake it via condvar - wakeup_event_.notify_one(); - lock.unlock(); - } - else if (reactor_running_ && !reactor_interrupted_) - { - // No idle workers but reactor is running - interrupt it - reactor_interrupted_ = true; - lock.unlock(); 
- interrupt_reactor(); - } - else - { - // No one to wake - lock.unlock(); - } -} - inline long select_scheduler::calculate_timeout(long requested_timeout_us) const { @@ -631,7 +281,6 @@ select_scheduler::calculate_timeout(long requested_timeout_us) const std::chrono::duration_cast(nearest - now) .count(); - // Clamp to [0, LONG_MAX] to prevent truncation on 32-bit long platforms constexpr auto long_max = static_cast((std::numeric_limits::max)()); auto capped_timer_us = @@ -642,45 +291,68 @@ select_scheduler::calculate_timeout(long requested_timeout_us) const if (requested_timeout_us < 0) return static_cast(capped_timer_us); - // requested_timeout_us is already long, so min() result fits in long return static_cast( (std::min)(static_cast(requested_timeout_us), capped_timer_us)); } inline void -select_scheduler::run_reactor(std::unique_lock& lock) +select_scheduler::run_task( + std::unique_lock& lock, context_type* ctx) { - // Calculate timeout considering timers, use 0 if interrupted - long effective_timeout_us = - reactor_interrupted_ ? 0 : calculate_timeout(-1); + long effective_timeout_us = task_interrupted_ ? 0 : calculate_timeout(-1); + + // Snapshot registered descriptors while holding lock. + // Record which fds need write monitoring to avoid a hot loop: + // select is level-triggered so writable sockets (nearly always + // writable) would cause select() to return immediately every + // iteration if unconditionally added to write_fds. 
+ struct fd_entry + { + int fd; + select_descriptor_state* desc; + bool needs_write; + }; + fd_entry snapshot[FD_SETSIZE]; + int snapshot_count = 0; + + for (auto& [fd, desc] : registered_descs_) + { + if (snapshot_count < FD_SETSIZE) + { + std::lock_guard desc_lock(desc->mutex); + snapshot[snapshot_count].fd = fd; + snapshot[snapshot_count].desc = desc; + snapshot[snapshot_count].needs_write = + (desc->write_op || desc->connect_op); + ++snapshot_count; + } + } + + if (lock.owns_lock()) + lock.unlock(); + + task_cleanup on_exit{this, &lock, ctx}; - // Build fd_sets from registered_fds_ fd_set read_fds, write_fds, except_fds; FD_ZERO(&read_fds); FD_ZERO(&write_fds); FD_ZERO(&except_fds); - // Always include the interrupt pipe FD_SET(pipe_fds_[0], &read_fds); int nfds = pipe_fds_[0]; - // Add registered fds - for (auto& [fd, state] : registered_fds_) + for (int i = 0; i < snapshot_count; ++i) { - if (state.read_op) - FD_SET(fd, &read_fds); - if (state.write_op) - { + int fd = snapshot[i].fd; + FD_SET(fd, &read_fds); + if (snapshot[i].needs_write) FD_SET(fd, &write_fds); - // Also monitor for errors on connect operations - FD_SET(fd, &except_fds); - } + FD_SET(fd, &except_fds); if (fd > nfds) nfds = fd; } - // Convert timeout to timeval struct timeval tv; struct timeval* tv_ptr = nullptr; if (effective_timeout_us >= 0) @@ -690,197 +362,65 @@ select_scheduler::run_reactor(std::unique_lock& lock) tv_ptr = &tv; } - lock.unlock(); - int ready = ::select(nfds + 1, &read_fds, &write_fds, &except_fds, tv_ptr); - int saved_errno = errno; + + // EINTR: signal interrupted select(), just retry. + // EBADF: an fd was closed between snapshot and select(); retry + // with a fresh snapshot from registered_descs_. 
+ if (ready < 0) + { + if (errno == EINTR || errno == EBADF) + return; + detail::throw_system_error(make_err(errno), "select"); + } // Process timers outside the lock timer_svc_->process_expired(); - if (ready < 0 && saved_errno != EINTR) - detail::throw_system_error(make_err(saved_errno), "select"); + op_queue local_ops; - // Re-acquire lock before modifying completed_ops_ - lock.lock(); - - // Drain the interrupt pipe if readable - if (ready > 0 && FD_ISSET(pipe_fds_[0], &read_fds)) - { - char buf[256]; - while (::read(pipe_fds_[0], buf, sizeof(buf)) > 0) - { - } - } - - // Process I/O completions - int completions_queued = 0; if (ready > 0) { - // Iterate over registered fds (copy keys to avoid iterator invalidation) - std::vector fds_to_check; - fds_to_check.reserve(registered_fds_.size()); - for (auto& [fd, state] : registered_fds_) - fds_to_check.push_back(fd); - - for (int fd : fds_to_check) + if (FD_ISSET(pipe_fds_[0], &read_fds)) { - auto it = registered_fds_.find(fd); - if (it == registered_fds_.end()) - continue; - - auto& state = it->second; - - // Check for errors (especially for connect operations) - bool has_error = FD_ISSET(fd, &except_fds); - - // Process read readiness - if (state.read_op && (FD_ISSET(fd, &read_fds) || has_error)) - { - auto* op = state.read_op; - // Claim the op by exchanging to unregistered. Both registering and - // registered states mean the op is ours to complete. 
- auto prev = op->registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - state.read_op = nullptr; - - if (has_error) - { - int errn = 0; - socklen_t len = sizeof(errn); - if (::getsockopt( - fd, SOL_SOCKET, SO_ERROR, &errn, &len) < 0) - errn = errno; - if (errn == 0) - errn = EIO; - op->complete(errn, 0); - } - else - { - op->perform_io(); - } - - completed_ops_.push(op); - ++completions_queued; - } - } - - // Process write readiness - if (state.write_op && (FD_ISSET(fd, &write_fds) || has_error)) + char buf[256]; + while (::read(pipe_fds_[0], buf, sizeof(buf)) > 0) { - auto* op = state.write_op; - // Claim the op by exchanging to unregistered. Both registering and - // registered states mean the op is ours to complete. - auto prev = op->registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - state.write_op = nullptr; - - if (has_error) - { - int errn = 0; - socklen_t len = sizeof(errn); - if (::getsockopt( - fd, SOL_SOCKET, SO_ERROR, &errn, &len) < 0) - errn = errno; - if (errn == 0) - errn = EIO; - op->complete(errn, 0); - } - else - { - op->perform_io(); - } - - completed_ops_.push(op); - ++completions_queued; - } } - - // Clean up empty entries - if (!state.read_op && !state.write_op) - registered_fds_.erase(it); } - } - - if (completions_queued > 0) - { - if (completions_queued == 1) - wakeup_event_.notify_one(); - else - wakeup_event_.notify_all(); - } -} - -inline std::size_t -select_scheduler::do_one(long timeout_us) -{ - std::unique_lock lock(mutex_); - - for (;;) - { - if (stopped_.load(std::memory_order_acquire)) - return 0; - - scheduler_op* op = completed_ops_.pop(); - if (op == &task_op_) + for (int i = 0; i < snapshot_count; ++i) { - bool more_handlers = !completed_ops_.empty(); + int fd = snapshot[i].fd; + select_descriptor_state* desc = 
snapshot[i].desc; + + std::uint32_t flags = 0; + if (FD_ISSET(fd, &read_fds)) + flags |= reactor_event_read; + if (FD_ISSET(fd, &write_fds)) + flags |= reactor_event_write; + if (FD_ISSET(fd, &except_fds)) + flags |= reactor_event_error; + + if (flags == 0) + continue; + + desc->add_ready_events(flags); - if (!more_handlers) + bool expected = false; + if (desc->is_enqueued_.compare_exchange_strong( + expected, true, std::memory_order_release, + std::memory_order_relaxed)) { - if (outstanding_work_.load(std::memory_order_acquire) == 0) - { - completed_ops_.push(&task_op_); - return 0; - } - if (timeout_us == 0) - { - completed_ops_.push(&task_op_); - return 0; - } + local_ops.push(desc); } - - reactor_interrupted_ = more_handlers || timeout_us == 0; - reactor_running_ = true; - - if (more_handlers && idle_thread_count_ > 0) - wakeup_event_.notify_one(); - - run_reactor(lock); - - reactor_running_ = false; - completed_ops_.push(&task_op_); - continue; } + } - if (op != nullptr) - { - lock.unlock(); - select::work_guard g{this}; - (*op)(); - return 1; - } - - if (outstanding_work_.load(std::memory_order_acquire) == 0) - return 0; - - if (timeout_us == 0) - return 0; + lock.lock(); - ++idle_thread_count_; - if (timeout_us < 0) - wakeup_event_.wait(lock); - else - wakeup_event_.wait_for(lock, std::chrono::microseconds(timeout_us)); - --idle_thread_count_; - } + if (!local_ops.empty()) + completed_ops_.splice(local_ops); } } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/select/select_socket.hpp b/include/boost/corosio/native/detail/select/select_socket.hpp index ff0c295e..28c425fa 100644 --- a/include/boost/corosio/native/detail/select/select_socket.hpp +++ b/include/boost/corosio/native/detail/select/select_socket.hpp @@ -14,13 +14,9 @@ #if BOOST_COROSIO_HAS_SELECT -#include -#include -#include - +#include #include - -#include +#include namespace boost::corosio::detail { @@ -28,14 +24,20 @@ class select_socket_service; /// Socket 
implementation for select backend. class select_socket final - : public tcp_socket::implementation - , public std::enable_shared_from_this - , public intrusive_list::node + : public reactor_socket< + select_socket, + select_socket_service, + select_op, + select_connect_op, + select_read_op, + select_write_op, + select_descriptor_state> { friend class select_socket_service; public: explicit select_socket(select_socket_service& svc) noexcept; + ~select_socket() override; std::coroutine_handle<> connect( std::coroutine_handle<>, @@ -60,56 +62,8 @@ class select_socket final std::error_code*, std::size_t*) override; - std::error_code shutdown(tcp_socket::shutdown_type what) noexcept override; - - native_handle_type native_handle() const noexcept override - { - return fd_; - } - - std::error_code set_option( - int level, - int optname, - void const* data, - std::size_t size) noexcept override; - std::error_code - get_option(int level, int optname, void* data, std::size_t* size) - const noexcept override; - - endpoint local_endpoint() const noexcept override - { - return local_endpoint_; - } - endpoint remote_endpoint() const noexcept override - { - return remote_endpoint_; - } - bool is_open() const noexcept - { - return fd_ >= 0; - } void cancel() noexcept override; - void cancel_single_op(select_op& op) noexcept; void close_socket() noexcept; - void set_socket(int fd) noexcept - { - fd_ = fd; - } - void set_endpoints(endpoint local, endpoint remote) noexcept - { - local_endpoint_ = local; - remote_endpoint_ = remote; - } - - select_connect_op conn_; - select_read_op rd_; - select_write_op wr_; - -private: - select_socket_service& svc_; - int fd_ = -1; - endpoint local_endpoint_; - endpoint remote_endpoint_; }; } // namespace boost::corosio::detail diff --git a/include/boost/corosio/native/detail/select/select_socket_service.hpp b/include/boost/corosio/native/detail/select/select_socket_service.hpp index 0a752e30..379480f3 100644 --- 
a/include/boost/corosio/native/detail/select/select_socket_service.hpp +++ b/include/boost/corosio/native/detail/select/select_socket_service.hpp @@ -20,81 +20,35 @@ #include #include +#include -#include -#include -#include +#include -#include - -#include +#include +#include +#include #include #include #include #include +#include #include #include -#include -#include -#include - /* - select Socket Implementation - ============================ - - This mirrors the epoll_sockets design for behavioral consistency. - Each I/O operation follows the same pattern: - 1. Try the syscall immediately (non-blocking socket) - 2. If it succeeds or fails with a real error, post to completion queue - 3. If EAGAIN/EWOULDBLOCK, register with select scheduler and wait - - Cancellation - ------------ - See op.hpp for the completion/cancellation race handling via the - `registered` atomic. cancel() must complete pending operations (post - them with cancelled flag) so coroutines waiting on them can resume. - close_socket() calls cancel() first to ensure this. - - Impl Lifetime with shared_ptr - ----------------------------- - Socket impls use enable_shared_from_this. The service owns impls via - shared_ptr maps (socket_ptrs_) keyed by raw pointer for O(1) lookup and - removal. When a user calls close(), we call cancel() which posts pending - ops to the scheduler. - - CRITICAL: The posted ops must keep the impl alive until they complete. - Otherwise the scheduler would process a freed op (use-after-free). The - cancel() method captures shared_from_this() into op.impl_ptr before - posting. When the op completes, impl_ptr is cleared, allowing the impl - to be destroyed if no other references exist. - - Service Ownership - ----------------- - select_socket_service owns all socket impls. destroy() removes the - shared_ptr from the map, but the impl may survive if ops still hold - impl_ptr refs. 
shutdown() closes all sockets and clears the map; any - in-flight ops will complete and release their refs. + Each I/O op tries the syscall speculatively; only registers with + the reactor on EAGAIN. Fd is registered once at open time and + stays registered until close. The reactor only marks ready_events_; + actual I/O happens in invoke_deferred_io(). cancel() captures + shared_from_this() into op.impl_ptr to keep the impl alive. */ namespace boost::corosio::detail { -/** State for select socket service. */ -class select_socket_state -{ -public: - explicit select_socket_state(select_scheduler& sched) noexcept - : sched_(sched) - { - } - - select_scheduler& sched_; - std::mutex mutex_; - intrusive_list socket_list_; - std::unordered_map> - socket_ptrs_; -}; +/// State for select socket service. +using select_socket_state = + reactor_service_state; /** select socket service implementation. @@ -125,7 +79,7 @@ class BOOST_COROSIO_DECL select_socket_service final : public socket_service { return state_->sched_; } - void post(select_op* op); + void post(scheduler_op* op); void work_started() noexcept; void work_finished() noexcept; @@ -133,15 +87,6 @@ class BOOST_COROSIO_DECL select_socket_service final : public socket_service std::unique_ptr state_; }; -// Backward compatibility alias -using select_sockets = select_socket_service; - -inline void -select_op::canceller::operator()() const noexcept -{ - op->cancel(); -} - inline void select_connect_op::cancel() noexcept { @@ -170,51 +115,24 @@ select_write_op::cancel() noexcept } inline void -select_connect_op::operator()() +select_op::operator()() { - stop_cb.reset(); - - bool success = (errn == 0 && !cancelled.load(std::memory_order_acquire)); - - // Cache endpoints on successful connect - if (success && socket_impl_) - { - endpoint local_ep; - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd, reinterpret_cast(&local_storage), &local_len) == - 0) - local_ep = 
from_sockaddr(local_storage); - static_cast(socket_impl_) - ->set_endpoints(local_ep, target_endpoint); - } - - if (ec_out) - { - if (cancelled.load(std::memory_order_acquire)) - *ec_out = capy::error::canceled; - else if (errn != 0) - *ec_out = make_err(errn); - else - *ec_out = {}; - } - - if (bytes_out) - *bytes_out = bytes_transferred; + complete_io_op(*this); +} - // Move to stack before destroying the frame - capy::executor_ref saved_ex(ex); - std::coroutine_handle<> saved_h(h); - impl_ptr.reset(); - dispatch_coro(saved_ex, saved_h).resume(); +inline void +select_connect_op::operator()() +{ + complete_connect_op(*this); } inline select_socket::select_socket(select_socket_service& svc) noexcept - : svc_(svc) + : reactor_socket(svc) { } +inline select_socket::~select_socket() = default; + inline std::coroutine_handle<> select_socket::connect( std::coroutine_handle<> h, @@ -223,88 +141,11 @@ select_socket::connect( std::stop_token token, std::error_code* ec) { - auto& op = conn_; - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.fd = fd_; - op.target_endpoint = ep; // Store target for endpoint caching - op.start(token, this); - - sockaddr_storage storage{}; - socklen_t addrlen = - detail::to_sockaddr(ep, detail::socket_family(fd_), storage); - int result = ::connect(fd_, reinterpret_cast(&storage), addrlen); - - if (result == 0) - { - // Sync success — cache endpoints immediately - sockaddr_storage local_storage{}; - socklen_t local_len = sizeof(local_storage); - if (::getsockname( - fd_, reinterpret_cast(&local_storage), &local_len) == - 0) - local_endpoint_ = detail::from_sockaddr(local_storage); - remote_endpoint_ = ep; - - op.complete(0, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - // completion is always posted to scheduler queue, never inline. 
- return std::noop_coroutine(); - } - - if (errno == EINPROGRESS) - { - svc_.work_started(); - op.impl_ptr = shared_from_this(); - - // Set registering BEFORE register_fd to close the race window where - // reactor sees an event before we set registered. The reactor treats - // registering the same as registered when claiming the op. - op.registered.store( - select_registration_state::registering, std::memory_order_release); - svc_.scheduler().register_fd(fd_, &op, select_scheduler::event_write); - - // Transition to registered. If this fails, reactor or cancel already - // claimed the op (state is now unregistered), so we're done. However, - // we must still deregister the fd because cancel's deregister_fd may - // have run before our register_fd, leaving the fd orphaned. - auto expected = select_registration_state::registering; - if (!op.registered.compare_exchange_strong( - expected, select_registration_state::registered, - std::memory_order_acq_rel)) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_write); - // completion is always posted to scheduler queue, never inline. - return std::noop_coroutine(); - } - - // If cancelled was set before we registered, handle it now. - if (op.cancelled.load(std::memory_order_acquire)) - { - auto prev = op.registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd( - fd_, select_scheduler::event_write); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - svc_.work_finished(); - } - } - // completion is always posted to scheduler queue, never inline. - return std::noop_coroutine(); - } - - op.complete(errno, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - // completion is always posted to scheduler queue, never inline. 
- return std::noop_coroutine(); + auto result = do_connect(h, ex, ep, token, ec); + // Rebuild fd_sets so select() watches for writability + if (result == std::noop_coroutine()) + svc_.scheduler().notify_reactor(); + return result; } inline std::coroutine_handle<> @@ -316,98 +157,7 @@ select_socket::read_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = rd_; - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - - capy::mutable_buffer bufs[select_read_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, select_read_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.empty_buffer_read = true; - op.complete(0, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - ssize_t n = ::readv(fd_, op.iovecs, op.iovec_count); - - if (n > 0) - { - op.complete(0, static_cast(n)); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); - } - - if (n == 0) - { - op.complete(0, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); - } - - if (errno == EAGAIN || errno == EWOULDBLOCK) - { - svc_.work_started(); - op.impl_ptr = shared_from_this(); - - // Set registering BEFORE register_fd to close the race window where - // reactor sees an event before we set registered. - op.registered.store( - select_registration_state::registering, std::memory_order_release); - svc_.scheduler().register_fd(fd_, &op, select_scheduler::event_read); - - // Transition to registered. If this fails, reactor or cancel already - // claimed the op (state is now unregistered), so we're done. 
However, - // we must still deregister the fd because cancel's deregister_fd may - // have run before our register_fd, leaving the fd orphaned. - auto expected = select_registration_state::registering; - if (!op.registered.compare_exchange_strong( - expected, select_registration_state::registered, - std::memory_order_acq_rel)) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_read); - return std::noop_coroutine(); - } - - // If cancelled was set before we registered, handle it now. - if (op.cancelled.load(std::memory_order_acquire)) - { - auto prev = op.registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd( - fd_, select_scheduler::event_read); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - svc_.work_finished(); - } - } - return std::noop_coroutine(); - } - - op.complete(errno, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); + return do_read_some(h, ex, param, token, ec, bytes_out); } inline std::coroutine_handle<> @@ -419,228 +169,23 @@ select_socket::write_some( std::error_code* ec, std::size_t* bytes_out) { - auto& op = wr_; - op.reset(); - op.h = h; - op.ex = ex; - op.ec_out = ec; - op.bytes_out = bytes_out; - op.fd = fd_; - op.start(token, this); - - capy::mutable_buffer bufs[select_write_op::max_buffers]; - op.iovec_count = - static_cast(param.copy_to(bufs, select_write_op::max_buffers)); - - if (op.iovec_count == 0 || (op.iovec_count == 1 && bufs[0].size() == 0)) - { - op.complete(0, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); - } - - for (int i = 0; i < op.iovec_count; ++i) - { - op.iovecs[i].iov_base = bufs[i].data(); - op.iovecs[i].iov_len = bufs[i].size(); - } - - msghdr msg{}; - msg.msg_iov = op.iovecs; - msg.msg_iovlen = static_cast(op.iovec_count); - - ssize_t n = ::sendmsg(fd_, &msg, MSG_NOSIGNAL); - - if (n > 
0) - { - op.complete(0, static_cast(n)); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); - } - - if (errno == EAGAIN || errno == EWOULDBLOCK) - { - svc_.work_started(); - op.impl_ptr = shared_from_this(); - - // Set registering BEFORE register_fd to close the race window where - // reactor sees an event before we set registered. - op.registered.store( - select_registration_state::registering, std::memory_order_release); - svc_.scheduler().register_fd(fd_, &op, select_scheduler::event_write); - - // Transition to registered. If this fails, reactor or cancel already - // claimed the op (state is now unregistered), so we're done. However, - // we must still deregister the fd because cancel's deregister_fd may - // have run before our register_fd, leaving the fd orphaned. - auto expected = select_registration_state::registering; - if (!op.registered.compare_exchange_strong( - expected, select_registration_state::registered, - std::memory_order_acq_rel)) - { - svc_.scheduler().deregister_fd(fd_, select_scheduler::event_write); - return std::noop_coroutine(); - } - - // If cancelled was set before we registered, handle it now. - if (op.cancelled.load(std::memory_order_acquire)) - { - auto prev = op.registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd( - fd_, select_scheduler::event_write); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - svc_.work_finished(); - } - } - return std::noop_coroutine(); - } - - op.complete(errno ? 
errno : EIO, 0); - op.impl_ptr = shared_from_this(); - svc_.post(&op); - return std::noop_coroutine(); -} - -inline std::error_code -select_socket::shutdown(tcp_socket::shutdown_type what) noexcept -{ - int how; - switch (what) - { - case tcp_socket::shutdown_receive: - how = SHUT_RD; - break; - case tcp_socket::shutdown_send: - how = SHUT_WR; - break; - case tcp_socket::shutdown_both: - how = SHUT_RDWR; - break; - default: - return make_err(EINVAL); - } - if (::shutdown(fd_, how) != 0) - return make_err(errno); - return {}; -} - -inline std::error_code -select_socket::set_option( - int level, int optname, void const* data, std::size_t size) noexcept -{ - if (::setsockopt(fd_, level, optname, data, static_cast(size)) != - 0) - return make_err(errno); - return {}; -} - -inline std::error_code -select_socket::get_option( - int level, int optname, void* data, std::size_t* size) const noexcept -{ - socklen_t len = static_cast(*size); - if (::getsockopt(fd_, level, optname, data, &len) != 0) - return make_err(errno); - *size = static_cast(len); - return {}; + auto result = do_write_some(h, ex, param, token, ec, bytes_out); + // Rebuild fd_sets so select() watches for writability + if (result == std::noop_coroutine()) + svc_.scheduler().notify_reactor(); + return result; } inline void select_socket::cancel() noexcept { - auto self = weak_from_this().lock(); - if (!self) - return; - - auto cancel_op = [this, &self](select_op& op, int events) { - auto prev = op.registered.exchange( - select_registration_state::unregistered, std::memory_order_acq_rel); - op.request_cancel(); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd(fd_, events); - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } - }; - - cancel_op(conn_, select_scheduler::event_write); - cancel_op(rd_, select_scheduler::event_read); - cancel_op(wr_, select_scheduler::event_write); -} - -inline void -select_socket::cancel_single_op(select_op& op) noexcept -{ 
- auto self = weak_from_this().lock(); - if (!self) - return; - - // Called from stop_token callback to cancel a specific pending operation. - auto prev = op.registered.exchange( - select_registration_state::unregistered, std::memory_order_acq_rel); - op.request_cancel(); - - if (prev != select_registration_state::unregistered) - { - // Determine which event type to deregister - int events = 0; - if (&op == &conn_ || &op == &wr_) - events = select_scheduler::event_write; - else if (&op == &rd_) - events = select_scheduler::event_read; - - svc_.scheduler().deregister_fd(fd_, events); - - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } + do_cancel(); } inline void select_socket::close_socket() noexcept { - auto self = weak_from_this().lock(); - if (self) - { - auto cancel_op = [this, &self](select_op& op, int events) { - auto prev = op.registered.exchange( - select_registration_state::unregistered, - std::memory_order_acq_rel); - op.request_cancel(); - if (prev != select_registration_state::unregistered) - { - svc_.scheduler().deregister_fd(fd_, events); - op.impl_ptr = self; - svc_.post(&op); - svc_.work_finished(); - } - }; - - cancel_op(conn_, select_scheduler::event_write); - cancel_op(rd_, select_scheduler::event_read); - cancel_op(wr_, select_scheduler::event_write); - } - - if (fd_ >= 0) - { - svc_.scheduler().deregister_fd( - fd_, select_scheduler::event_read | select_scheduler::event_write); - ::close(fd_); - fd_ = -1; - } - - local_endpoint_ = endpoint{}; - remote_endpoint_ = endpoint{}; + do_close_socket(); } inline select_socket_service::select_socket_service( @@ -658,10 +203,10 @@ select_socket_service::shutdown() { std::lock_guard lock(state_->mutex_); - while (auto* impl = state_->socket_list_.pop_front()) + while (auto* impl = state_->impl_list_.pop_front()) impl->close_socket(); - // Don't clear socket_ptrs_ here. The scheduler shuts down after us and + // Don't clear impl_ptrs_ here. 
The scheduler shuts down after us and // drains completed_ops_, calling destroy() on each queued op. Letting // ~state_ release the ptrs (during service destruction, after scheduler // shutdown) keeps every impl alive until all ops have been drained. @@ -675,8 +220,8 @@ select_socket_service::construct() { std::lock_guard lock(state_->mutex_); - state_->socket_list_.push_back(raw); - state_->socket_ptrs_.emplace(raw, std::move(impl)); + state_->impl_ptrs_.emplace(raw, std::move(impl)); + state_->impl_list_.push_back(raw); } return raw; @@ -688,8 +233,8 @@ select_socket_service::destroy(io_object::implementation* impl) auto* select_impl = static_cast(impl); select_impl->close_socket(); std::lock_guard lock(state_->mutex_); - state_->socket_list_.remove(select_impl); - state_->socket_ptrs_.erase(select_impl); + state_->impl_list_.remove(select_impl); + state_->impl_ptrs_.erase(select_impl); } inline std::error_code @@ -709,7 +254,6 @@ select_socket_service::open_socket( ::setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); } - // Set non-blocking and close-on-exec int flags = ::fcntl(fd, F_GETFL, 0); if (flags == -1) { @@ -730,14 +274,30 @@ select_socket_service::open_socket( return make_err(errn); } - // Check fd is within select() limits if (fd >= FD_SETSIZE) { ::close(fd); - return make_err(EMFILE); // Too many open files + return make_err(EMFILE); } +#ifdef SO_NOSIGPIPE + { + int one = 1; + ::setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)); + } +#endif + select_impl->fd_ = fd; + + select_impl->desc_state_.fd = fd; + { + std::lock_guard lock(select_impl->desc_state_.mutex); + select_impl->desc_state_.read_op = nullptr; + select_impl->desc_state_.write_op = nullptr; + select_impl->desc_state_.connect_op = nullptr; + } + scheduler().register_descriptor(fd, &select_impl->desc_state_); + return {}; } @@ -748,7 +308,7 @@ select_socket_service::close(io_object::handle& h) } inline void -select_socket_service::post(select_op* op) 
+select_socket_service::post(scheduler_op* op) { state_->sched_.post(op); }