diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 5f3d0b2ccc..6ce4a2a0d7 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -57,7 +57,7 @@ set +e export RAPIDS_TESTS_DIR rapids-logger "Run gtests" -timeout 50m ./ci/run_ctests.sh || FAILED_STEPS+=("gtests (run_ctests.sh)") +timeout 60m ./ci/run_ctests.sh || FAILED_STEPS+=("gtests (run_ctests.sh)") rapids-logger "Generate nightly test report" source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index ed616e255f..a603c69098 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -22,7 +22,7 @@ PYDISTCHECK_ARGS=( if [[ "${package_dir}" == "python/libcuopt" ]]; then if [[ "${RAPIDS_CUDA_MAJOR}" == "12" ]]; then PYDISTCHECK_ARGS+=( - --max-allowed-size-compressed '680Mi' + --max-allowed-size-compressed '690Mi' ) else PYDISTCHECK_ARGS+=( diff --git a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp index 922e0dfc5f..76258ec0bd 100644 --- a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -130,16 +131,11 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t constraints) override; void add_quadratic_constraint(char constraint_row_type, f_t rhs_value, - const f_t* quadratic_values, - i_t size_quadratic_values, - const i_t* quadratic_indices, - i_t size_quadratic_indices, - const i_t* quadratic_offsets, - i_t size_quadratic_offsets, - const f_t* linear_values, - i_t size_linear_values, - const i_t* linear_indices, - i_t size_linear_indices) override; + std::span row_index, + std::span col_index, + std::span coeff, + std::span linear_values, + std::span linear_indices) override; bool has_quadratic_constraints() const override; const std::vector& 
get_quadratic_constraints() const override; diff --git a/cpp/include/cuopt/linear_programming/io/data_model_view.hpp b/cpp/include/cuopt/linear_programming/io/data_model_view.hpp index ca2fd30393..4d62458263 100644 --- a/cpp/include/cuopt/linear_programming/io/data_model_view.hpp +++ b/cpp/include/cuopt/linear_programming/io/data_model_view.hpp @@ -434,9 +434,9 @@ class data_model_view_t { std::vector(qc.linear_values.begin(), qc.linear_values.end()), std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), static_cast(qc.rhs_value), - std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), - std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), - std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + std::vector(qc.rows.begin(), qc.rows.end()), + std::vector(qc.cols.begin(), qc.cols.end()), + std::vector(qc.vals.begin(), qc.vals.end())}); } } diff --git a/cpp/include/cuopt/linear_programming/io/mps_data_model.hpp b/cpp/include/cuopt/linear_programming/io/mps_data_model.hpp index 9828a00c0c..145dee51ef 100644 --- a/cpp/include/cuopt/linear_programming/io/mps_data_model.hpp +++ b/cpp/include/cuopt/linear_programming/io/mps_data_model.hpp @@ -239,7 +239,7 @@ class mps_data_model_t { * - row identity and type (from ROWS), * - sparse linear coefficients (from COLUMNS), * - RHS value (from RHS), - * - quadratic matrix Q in CSR (from QCMATRIX). + * - quadratic matrix Q in COO (SoA: row, col, value) from QCMATRIX — one triplet per nonzero. */ struct quadratic_constraint_t { /** ROWS declaration index (among all constraint rows), not an index into the linear CSR. */ @@ -251,19 +251,21 @@ class mps_data_model_t { std::vector linear_values{}; std::vector linear_indices{}; f_t rhs_value{f_t(0)}; - std::vector quadratic_values{}; - std::vector quadratic_indices{}; - std::vector quadratic_offsets{}; + /** Q nonzeros: parallel arrays, same length (COO / SoA). Sorted by (row, col) in append. 
*/ + std::vector rows{}; + std::vector cols{}; + std::vector vals{}; }; /** * @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). * @note All span inputs are host memory; the model copies this data. * @param linear_values, linear_indices Same nnz; can be empty for a purely quadratic row (rare). - * @param quadratic_values, quadratic_indices CSR nnz; may be empty if Q is empty. - * @param quadratic_offsets CSR row starts; must be non-empty. - * @param constraint_row_type MPS ROWS type; must be 'L'. 'G' and 'E' quadratic rows are not - * supported. + * @param vals, rows, cols COO triplets for Q; same length; may all be empty if Q is empty. + * Stored sorted by (row, col). + * @param constraint_row_type MPS ROWS type: 'L' (<=) or 'G' (>=). Stored as given; 'G' rows are + * converted to '<=' form when building the SOCP for the barrier solver. Equality ('E') is + * not supported. */ void append_quadratic_constraint(i_t constraint_row_index, const std::string& constraint_row_name, @@ -271,9 +273,9 @@ class mps_data_model_t { std::span linear_values, std::span linear_indices, f_t rhs_value, - std::span quadratic_values, - std::span quadratic_indices, - std::span quadratic_offsets); + std::span vals, + std::span rows, + std::span cols); const std::vector& get_quadratic_constraints() const; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem.hpp b/cpp/include/cuopt/linear_programming/optimization_problem.hpp index 238ff42c22..1b9302978b 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem.hpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -212,16 +213,11 @@ class optimization_problem_t : public optimization_problem_interface_t void set_quadratic_constraints(std::vector constraints) override; void add_quadratic_constraint(char constraint_row_type, f_t rhs_value, - const f_t* quadratic_values, - i_t size_quadratic_values, 
- const i_t* quadratic_indices, - i_t size_quadratic_indices, - const i_t* quadratic_offsets, - i_t size_quadratic_offsets, - const f_t* linear_values, - i_t size_linear_values, - const i_t* linear_indices, - i_t size_linear_indices) override; + std::span row_index, + std::span col_index, + std::span coeff, + std::span linear_values, + std::span linear_indices) override; /** @copydoc optimization_problem_interface_t::set_variable_lower_bounds */ void set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) override; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index c2086d0b75..e39d180d7a 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -64,9 +65,10 @@ class optimization_problem_interface_t { std::vector linear_values{}; std::vector linear_indices{}; f_t rhs_value{f_t(0)}; - std::vector quadratic_values{}; - std::vector quadratic_indices{}; - std::vector quadratic_offsets{}; + /** Q in COO: parallel arrays, same length. */ + std::vector rows{}; + std::vector cols{}; + std::vector vals{}; }; virtual ~optimization_problem_interface_t() = default; @@ -79,22 +81,17 @@ class optimization_problem_interface_t { /** * @brief Append one quadratic constraint x^T Q x + d^T x {<=, >=} rhs. * - * Quadratic matrix Q is CSR (values, indices, offsets). Linear term d uses parallel + * Quadratic matrix Q is COO (row_index, col_index, coeff spans). Linear term d uses parallel * linear_values and linear_indices (empty allowed). constraint_row_index is assigned * automatically as n_linear_constraints + n_existing_quadratic_constraints. 
*/ virtual void add_quadratic_constraint(char constraint_row_type, f_t rhs_value, - const f_t* quadratic_values, - i_t size_quadratic_values, - const i_t* quadratic_indices, - i_t size_quadratic_indices, - const i_t* quadratic_offsets, - i_t size_quadratic_offsets, - const f_t* linear_values, - i_t size_linear_values, - const i_t* linear_indices, - i_t size_linear_indices) = 0; + std::span row_index, + std::span col_index, + std::span coeff, + std::span linear_values, + std::span linear_indices) = 0; template >> void set_quadratic_constraints(const std::vector& constraints) @@ -109,9 +106,9 @@ class optimization_problem_interface_t { std::vector(qc.linear_values.begin(), qc.linear_values.end()), std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), static_cast(qc.rhs_value), - std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), - std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), - std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + std::vector(qc.rows.begin(), qc.rows.end()), + std::vector(qc.cols.begin(), qc.cols.end()), + std::vector(qc.vals.begin(), qc.vals.end())}); } set_quadratic_constraints(std::move(converted_constraints)); } diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp index 53ef6da4e5..f6599c4ea6 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp @@ -85,6 +85,10 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr n_nonzeros, A_offsets.data(), n_constraints + 1); + } else { + // Set empty constraint matrix + std::vector offsets(1, 0); + problem->set_csr_constraint_matrix(nullptr, 0, nullptr, 0, offsets.data(), 1); } } diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index e7604fb60e..0c92e83b86 100644 --- a/cpp/src/barrier/barrier.cu +++ 
b/cpp/src/barrier/barrier.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -31,9 +32,12 @@ #include #include +#include #include #include +#include +#include #include #include @@ -43,49 +47,160 @@ #include #include #include +#include namespace cuopt::linear_programming::dual_simplex { -// non-template wrappers to work around clang compiler bug -[[maybe_unused]] static void pairwise_multiply( - float* a, float* b, float* out, int size, rmm::cuda_stream_view stream) +template +bool validate_barrier_cone_layout(const lp_problem_t& problem, + const simplex_solver_settings_t& settings) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream.value()); + if (problem.second_order_cone_dims.empty()) { return true; } + + i_t cone_end = problem.cone_var_start; + for (i_t q_k : problem.second_order_cone_dims) { + if (q_k <= 1) { + settings.log.printf( + "Error: second-order cone dimensions must be at least 2; use linear variables instead of " + "Q^1\n"); + return false; + } + cone_end += q_k; + } + + if (cone_end != problem.num_cols) { + settings.log.printf("Error: conic variables must form a trailing block [linear | cone]\n"); + return false; + } + + for (i_t j = problem.cone_var_start; j < cone_end; ++j) { + if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { + settings.log.printf("Error: explicit lower bound on conic variable %d is not supported\n", j); + return false; + } + if (problem.upper[j] < inf) { + settings.log.printf("Error: explicit upper bound on conic variable %d is not supported\n", j); + return false; + } + } + + return true; } +template [[maybe_unused]] static void pairwise_multiply( - double* a, double* b, double* out, int size, rmm::cuda_stream_view stream) + f_t* a, f_t* b, f_t* out, int size, rmm::cuda_stream_view stream) { cub::DeviceTransform::Transform( cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream.value()); } -[[maybe_unused]] static 
void axpy( - float alpha, float* x, float beta, float* y, float* out, int size, rmm::cuda_stream_view stream) +// out[i] = is_direct_free_linear[i] ? 0 : a[i] * b[i] +template +[[maybe_unused]] static void pairwise_multiply_skip_direct_free_linear( + f_t* a, f_t* b, int* is_direct_free_linear, f_t* out, int size, rmm::cuda_stream_view stream) { cub::DeviceTransform::Transform( - cuda::std::make_tuple(x, y), + cuda::std::make_tuple(a, b, is_direct_free_linear), out, size, - [alpha, beta] __host__ __device__(float a, float b) { return alpha * a + beta * b; }, + [] __host__ __device__(f_t x_j, f_t d_j, int free_j) { return free_j ? f_t{0} : x_j * d_j; }, stream.value()); } -[[maybe_unused]] static void axpy(double alpha, - double* x, - double beta, - double* y, - double* out, - int size, - rmm::cuda_stream_view stream) +template +[[maybe_unused]] static void axpy( + f_t alpha, f_t* x, f_t beta, f_t* y, f_t* out, int size, rmm::cuda_stream_view stream) { cub::DeviceTransform::Transform( cuda::std::make_tuple(x, y), out, size, - [alpha, beta] __host__ __device__(double a, double b) { return alpha * a + beta * b; }, + [alpha, beta] __host__ __device__(f_t a, f_t b) { return alpha * a + beta * b; }, + stream.value()); +} + +// step size computation for nonnegative and free variables +template +static f_t max_nonnegative_step_length_in_range( + transform_reduce_helper_t& transform_reduce_helper, + const rmm::device_uvector& x, + const rmm::device_uvector& dx, + i_t len, + const rmm::device_uvector& is_direct_free_linear, + bool apply_direct_free_mask, + rmm::cuda_stream_view stream) +{ + if (len <= 0) { return f_t(1); } + + return transform_reduce_helper.transform_reduce( + thrust::make_zip_iterator(dx.data(), x.data(), is_direct_free_linear.data()), + thrust::minimum{}, + [apply_direct_free_mask] HD(const thrust::tuple& t) { + const f_t dx_val = thrust::get<0>(t); + const f_t x_val = thrust::get<1>(t); + const i_t is_free = thrust::get<2>(t); + return 
(!(apply_direct_free_mask && is_free) && dx_val < f_t(0.0)) ? -x_val / dx_val + : f_t(1.0); + }, + f_t(1.0), + len, + stream); +} + +// Linear (orthant) block only; SOC uses recover_cone_dz_from_target. +template +static void recover_linear_orthant_dz(raft::device_span target, + raft::device_span z, + raft::device_span dx, + raft::device_span x, + raft::device_span dz, + raft::device_span is_direct_free_linear, + rmm::cuda_stream_view stream) +{ + if (dz.empty()) return; + + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + target.data(), z.data(), dx.data(), x.data(), is_direct_free_linear.data()), + dz.data(), + dz.size(), + [] HD(f_t target_val, f_t z_val, f_t dx_val, f_t x_val, i_t is_direct_free) { + if (is_direct_free) return f_t(0); + return target_val - (z_val * dx_val) / x_val; + }, stream.value()); + RAFT_CHECK_CUDA(stream); +} + +template +static void negate_complementarity_rhs(raft::device_span out, + raft::device_span residual, + rmm::cuda_stream_view stream) +{ + if (out.empty()) return; + cub::DeviceTransform::Transform( + residual.data(), out.data(), out.size(), [] HD(f_t rhs) { return -rhs; }, stream.value()); +} + +template +static void fill_linear_cc_rhs(raft::device_span out, + raft::device_span dx_aff, + raft::device_span dz_aff, + f_t new_mu, + raft::device_span is_direct_free_linear, + rmm::cuda_stream_view stream) +{ + if (out.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dx_aff.data(), dz_aff.data(), is_direct_free_linear.data()), + out.data(), + out.size(), + [new_mu] HD(f_t dx_aff_val, f_t dz_aff_val, i_t is_direct_free_linear) { + return is_direct_free_linear ? 
f_t(0) : (-(dx_aff_val * dz_aff_val) + new_mu); + }, + stream.value()); + RAFT_CHECK_CUDA(stream); } template @@ -93,7 +208,7 @@ class iteration_data_t { public: iteration_data_t(const lp_problem_t& lp, i_t num_upper_bounds, - const std::vector& free_variable_indices, + const std::vector& direct_free_variables, const csc_matrix_t& Qin, const simplex_solver_settings_t& settings) : upper_bounds(num_upper_bounds), @@ -164,10 +279,12 @@ class iteration_data_t { d_inv_diag(lp.num_cols, lp.handle_ptr->get_stream()), d_cols_to_remove(0, lp.handle_ptr->get_stream()), d_augmented_diagonal_indices_(0, lp.handle_ptr->get_stream()), + d_cone_csr_indices_(0, lp.handle_ptr->get_stream()), + d_cone_Q_values_(0, lp.handle_ptr->get_stream()), use_augmented(false), has_factorization(false), - n_free_vars(0), - d_is_free_(0, lp.handle_ptr->get_stream()), + n_direct_free_linear(0), + d_is_direct_free_linear_(0, lp.handle_ptr->get_stream()), num_factorizations(0), has_solve_info(false), settings_(settings), @@ -184,6 +301,8 @@ class iteration_data_t { d_tmp4_(lp.num_cols, lp.handle_ptr->get_stream()), d_r1_(lp.num_cols, lp.handle_ptr->get_stream()), d_r1_prime_(lp.num_cols, lp.handle_ptr->get_stream()), + d_augmented_rhs_(0, lp.handle_ptr->get_stream()), + d_augmented_soln_(0, lp.handle_ptr->get_stream()), d_c_(lp.num_cols, lp.handle_ptr->get_stream()), d_upper_(0, lp.handle_ptr->get_stream()), d_u_(lp.A.n, lp.handle_ptr->get_stream()), @@ -209,9 +328,11 @@ class iteration_data_t { d_dw_residual_(0, lp.handle_ptr->get_stream()), d_wv_residual_(0, lp.handle_ptr->get_stream()), d_bound_rhs_(0, lp.handle_ptr->get_stream()), - d_complementarity_xz_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), + d_complementarity_xz_rhs_(0, lp.handle_ptr->get_stream()), d_complementarity_wv_rhs_(0, lp.handle_ptr->get_stream()), d_dual_rhs_(lp.num_cols, lp.handle_ptr->get_stream()), + d_complementarity_target_(lp.num_cols, lp.handle_ptr->get_stream()), + d_cone_hessian_dx_(0, lp.handle_ptr->get_stream()), 
d_Q_diag_(0, lp.handle_ptr->get_stream()), d_Qx_(Qin.m, lp.handle_ptr->get_stream()), restrict_u_(0), @@ -219,192 +340,262 @@ class iteration_data_t { sum_reduce_helper_(lp.handle_ptr->get_stream()), indefinite_Q(false), Q_diagonal(false), - symbolic_status(0) + symbolic_status(0), + cone_combined_step_(false), + cone_sigma_mu_(f_t(0)) { raft::common::nvtx::range fun_scope("Barrier: LP Data Creation"); - // Set up free variable tracking for QPs - if (!free_variable_indices.empty()) { - n_free_vars = free_variable_indices.size(); - std::vector is_free_host(lp.num_cols, 0); - for (i_t j : free_variable_indices) { - is_free_host[j] = 1; + { + raft::common::nvtx::range scope("Barrier: LP Data: direct free linear"); + // Setup tracking of direct free variables (linear columns only j < cone_start) + n_direct_free_linear = direct_free_variables.size(); + std::vector is_direct_free_linear_host(lp.num_cols, 0); + for (i_t j : direct_free_variables) { + is_direct_free_linear_host[j] = 1; + } + d_is_direct_free_linear_.resize(lp.num_cols, stream_view_); + raft::copy(d_is_direct_free_linear_.data(), + is_direct_free_linear_host.data(), + lp.num_cols, + stream_view_); + if (n_direct_free_linear > 0) { + settings.log.printf("Free variables : %d\n", n_direct_free_linear); } - d_is_free_.resize(lp.num_cols, stream_view_); - raft::copy(d_is_free_.data(), is_free_host.data(), lp.num_cols, stream_view_); - settings.log.printf("Free variables (QP) : %d\n", n_free_vars); } bool has_Q = Q.x.size() > 0; indefinite_Q = false; - if (has_Q) { - Qdiag.resize(lp.num_cols, 0.0); - - for (i_t j = 0; j < Q.n; j++) { - const i_t col_start = Q.col_start[j]; - const i_t col_end = Q.col_start[j + 1]; - for (i_t p = col_start; p < col_end; p++) { - const i_t i = Q.i[p]; - if (j == i) { - Qdiag[j] = Q.x[p]; - break; + { + raft::common::nvtx::range scope("Barrier: LP Data: Q setup"); + if (has_Q) { + Qdiag.resize(lp.num_cols, 0.0); + + for (i_t j = 0; j < Q.n; j++) { + const i_t col_start = 
Q.col_start[j]; + const i_t col_end = Q.col_start[j + 1]; + for (i_t p = col_start; p < col_end; p++) { + const i_t i = Q.i[p]; + if (j == i) { + Qdiag[j] = Q.x[p]; + break; + } } } - } - Q_diagonal = Q.is_diagonal(); + Q_diagonal = Q.is_diagonal(); - if (Q_diagonal) { - // Check to ensure that Q is positive semi-definite - for (i_t j = 0; j < lp.num_cols; j++) { - if (Qdiag[j] < 0.0) { - settings_.log.printf( - "Q is not positive semidefinite: Q(%d, %d) = %e\n", j, j, Qdiag[j]); - indefinite_Q = true; - return; + if (Q_diagonal) { + // Check to ensure that Q is positive semi-definite + for (i_t j = 0; j < lp.num_cols; j++) { + if (Qdiag[j] < 0.0) { + settings_.log.printf( + "Q is not positive semidefinite: Q(%d, %d) = %e\n", j, j, Qdiag[j]); + indefinite_Q = true; + return; + } } + } else if (settings.check_Q) { + // TODO: Check to ensure that Q is positive semi-definite + // This requires us to perform a Cholesky factorization. + settings.log.printf( + "Warning: positive semidefiniteness check for general Q is not implemented yet.\n"); } - } else if (settings.check_Q) { - // TODO: Check to ensure that Q is positive semi-definite - // This requires us to perform a Cholesky factorization. 
+ + d_Q_diag_.resize(lp.num_cols, stream_view_); + raft::copy(d_Q_diag_.data(), Qdiag.data(), Qdiag.size(), stream_view_); } + } - d_Q_diag_.resize(lp.num_cols, stream_view_); - raft::copy(d_Q_diag_.data(), Qdiag.data(), Qdiag.size(), stream_view_); + if (!lp.second_order_cone_dims.empty()) { + raft::common::nvtx::range scope("Barrier: LP Data: SOC setup"); + cone_var_start_ = lp.cone_var_start; + i_t total_cone_dim = + std::accumulate(lp.second_order_cone_dims.begin(), lp.second_order_cone_dims.end(), i_t(0)); + cuopt_assert(cone_var_start_ >= 0, "cone_var_start must be nonnegative"); + cuopt_assert(cone_var_start_ + total_cone_dim <= lp.num_cols, + "cone variables exceed problem dimension"); + cuopt_assert(cone_var_start_ + total_cone_dim == lp.num_cols, + "barrier expects [linear | cone] layout"); + cones_.emplace( + std::span(lp.second_order_cone_dims.data(), lp.second_order_cone_dims.size()), + raft::device_span{}, + raft::device_span{}, + stream_view_); + cuopt_assert(cone_count() > 0, "second-order cone topology must contain at least one cone"); + cuopt_assert(cone_entry_count() == total_cone_dim, "second-order cone entry count mismatch"); } - // Allocating GPU flag data for Form ADAT - RAFT_CUDA_TRY(cub::DeviceSelect::Flagged( - nullptr, - flag_buffer_size, - d_inv_diag_prime.data(), // Not the actual input but just to allcoate the memory - thrust::make_transform_iterator(d_cols_to_remove.data(), cuda::std::logical_not{}), - d_inv_diag_prime.data(), - d_num_flag.data(), - inv_diag.size(), - stream_view_)); + { + raft::common::nvtx::range scope("Barrier: LP Data: complementarity buffers"); + const i_t linear_xz_rhs_size = linear_xz_size(lp.num_cols); + d_complementarity_xz_rhs_.resize(linear_xz_rhs_size, stream_view_); - d_flag_buffer.resize(flag_buffer_size, stream_view_); + // Allocate GPU flag data for Form ADAT + RAFT_CUDA_TRY(cub::DeviceSelect::Flagged( + nullptr, + flag_buffer_size, + d_inv_diag_prime.data(), // Not the actual input but just to 
allcoate the memory + thrust::make_transform_iterator(d_cols_to_remove.data(), cuda::std::logical_not{}), + d_inv_diag_prime.data(), + d_num_flag.data(), + inv_diag.size(), + stream_view_)); - // Create the upper bounds vector - n_upper_bounds = 0; - for (i_t j = 0; j < lp.num_cols; j++) { - if (lp.upper[j] < inf) { upper_bounds[n_upper_bounds++] = j; } + d_flag_buffer.resize(flag_buffer_size, stream_view_); } - if (n_upper_bounds > 0) { - settings.log.printf("Upper bounds : %d\n", n_upper_bounds); + + { + raft::common::nvtx::range scope("Barrier: LP Data: upper bounds"); + // Create the upper bounds vector + n_upper_bounds = 0; + for (i_t j = 0; j < lp.num_cols; j++) { + if (lp.upper[j] < inf) { upper_bounds[n_upper_bounds++] = j; } + } + if (n_upper_bounds > 0) { + settings.log.printf("Upper bounds : %d\n", n_upper_bounds); + } } - // Decide if we are going to use the augmented system or not - n_dense_columns = 0; - i_t n_dense_rows = 0; - i_t max_row_nz = 0; - f_t estimated_nz_AAT = 0.0; std::vector dense_columns_unordered; + { + raft::common::nvtx::range scope("Barrier: LP Data: dense columns and augmented"); + // Decide if we are going to use the augmented system or not + n_dense_columns = 0; + i_t n_dense_rows = 0; + i_t max_row_nz = 0; + f_t estimated_nz_AAT = 0.0; + + const bool has_soc = has_cones(); + + if (has_soc) { + primal_perturb = 1e-8; + dual_perturb = 1e-8; + } else { + primal_perturb = 1e-6; + dual_perturb = 0; + } - f_t start_column_density = tic(); + if (has_soc) { + // SOCP always use the augmented KKT; skip dense-column / ADAT heuristics. 
+ use_augmented = true; + n_dense_columns = 0; + } else { + f_t start_column_density = tic(); - // Do not look for dense columns if Q is not diagonal - if (!has_Q || Q_diagonal) { - find_dense_columns( - lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT); - } - if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } + // Do not look for dense columns if Q is not diagonal + if (!has_Q || Q_diagonal) { + find_dense_columns( + lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT); + } + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } #ifdef PRINT_INFO - for (i_t j : dense_columns_unordered) { - settings.log.printf("Dense column %6d\n", j); - } + for (i_t j : dense_columns_unordered) { + settings.log.printf("Dense column %6d\n", j); + } #endif - float64_t column_density_time = toc(start_column_density); - if (!settings.eliminate_dense_columns) { dense_columns_unordered.clear(); } - n_dense_columns = static_cast(dense_columns_unordered.size()); - if (n_dense_columns > 0) { - settings.log.printf("Dense columns : %d\n", n_dense_columns); - } - if (n_dense_rows > 0) { - settings.log.printf("Dense rows : %d\n", n_dense_rows); - } - settings.log.printf("Density estimator time : %.2fs\n", column_density_time); - if ((settings.augmented != 0) && - (n_dense_columns > 50 || n_dense_rows > 10 || - lp.A.m == 0 /* handle case with no constraints */ || - (max_row_nz > 5000 && estimated_nz_AAT > 1e10) || settings.augmented == 1)) { - use_augmented = true; - n_dense_columns = 0; - } + float64_t column_density_time = toc(start_column_density); + if (!settings.eliminate_dense_columns) { dense_columns_unordered.clear(); } + n_dense_columns = static_cast(dense_columns_unordered.size()); + if (n_dense_columns > 0) { + settings.log.printf("Dense columns : %d\n", n_dense_columns); + } + if (n_dense_rows > 0) { + settings.log.printf("Dense rows : %d\n", 
n_dense_rows); + } + settings.log.printf("Density estimator time : %.2fs\n", column_density_time); + if ((settings.augmented != 0) && + (n_dense_columns > 50 || n_dense_rows > 10 || + lp.A.m == 0 /* handle case with no constraints */ || + (max_row_nz > 5000 && estimated_nz_AAT > 1e10) || settings.augmented == 1)) { + use_augmented = true; + n_dense_columns = 0; + } + } - if (has_Q && !use_augmented) { - // For now let's not deal with dense columns - n_dense_columns = 0; - use_augmented = !Q_diagonal; - } + if (has_Q && !use_augmented) { + // For now let's not deal with dense columns + n_dense_columns = 0; + use_augmented = !Q_diagonal; + } - if (use_augmented) { - settings.log.printf("Linear system : augmented\n"); - } else { - settings.log.printf("Linear system : ADAT\n"); + if (use_augmented) { + settings.log.printf("Linear system : augmented\n"); + const i_t augmented_size = lp.num_cols + lp.num_rows; + d_augmented_rhs_.resize(augmented_size, stream_view_); + d_augmented_soln_.resize(augmented_size, stream_view_); + } else { + settings.log.printf("Linear system : ADAT\n"); + } } - // D = I + EET - diag.set_scalar(1.0); - if (n_upper_bounds > 0) { - for (i_t k = 0; k < n_upper_bounds; k++) { - i_t j = upper_bounds[k]; - diag[j] = 2.0; + { + raft::common::nvtx::range scope("Barrier: LP Data: diag and inv_diag"); + // D = I + EET + diag.set_scalar(1.0); + if (n_upper_bounds > 0) { + for (i_t k = 0; k < n_upper_bounds; k++) { + i_t j = upper_bounds[k]; + diag[j] = 2.0; + } } - } - // D = I + EET + Q (if Q is diagonal) - if (has_Q && !use_augmented) { - // this means that Q is diagonal - for (i_t j = 0; j < Q.n; j++) { - diag[j] += Qdiag[j]; + // D = I + EET + Q (if Q is diagonal) + if (has_Q && !use_augmented) { + // this means that Q is diagonal + for (i_t j = 0; j < Q.n; j++) { + diag[j] += Qdiag[j]; + } } - } - inv_diag.set_scalar(1.0); - if (use_augmented) { diag.multiply_scalar(-1.0); } - if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { 
diag.inverse(inv_diag); } - // TMP diag and inv_diag should directly created and filled on the GPU - raft::copy(d_inv_diag.data(), inv_diag.data(), inv_diag.size(), stream_view_); - inv_sqrt_diag.set_scalar(1.0); - if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { inv_diag.sqrt(inv_sqrt_diag); } + inv_diag.set_scalar(1.0); + if (use_augmented) { diag.multiply_scalar(-1.0); } + if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { diag.inverse(inv_diag); } + // TMP diag and inv_diag should directly created and filled on the GPU + raft::copy(d_inv_diag.data(), inv_diag.data(), inv_diag.size(), stream_view_); + inv_sqrt_diag.set_scalar(1.0); + if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { inv_diag.sqrt(inv_sqrt_diag); } + } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - // Copy A into AD - AD = lp.A; - if (!use_augmented && n_dense_columns > 0) { - cols_to_remove.resize(lp.num_cols, 0); - for (i_t k : dense_columns_unordered) { - cols_to_remove[k] = 1; - } - d_cols_to_remove.resize(cols_to_remove.size(), stream_view_); - raft::copy( - d_cols_to_remove.data(), cols_to_remove.data(), cols_to_remove.size(), stream_view_); - dense_columns.clear(); - dense_columns.reserve(n_dense_columns); - for (i_t j = 0; j < lp.num_cols; j++) { - if (cols_to_remove[j]) { dense_columns.push_back(j); } - } - AD.remove_columns(cols_to_remove); + { + raft::common::nvtx::range scope("Barrier: LP Data: AD matrix setup"); + // Copy A into AD + AD = lp.A; + if (!use_augmented && n_dense_columns > 0) { + cols_to_remove.resize(lp.num_cols, 0); + for (i_t k : dense_columns_unordered) { + cols_to_remove[k] = 1; + } + d_cols_to_remove.resize(cols_to_remove.size(), stream_view_); + raft::copy( + d_cols_to_remove.data(), cols_to_remove.data(), cols_to_remove.size(), stream_view_); + dense_columns.clear(); + dense_columns.reserve(n_dense_columns); + for (i_t j = 0; j < lp.num_cols; j++) { + if (cols_to_remove[j]) { dense_columns.push_back(j); } + 
} + AD.remove_columns(cols_to_remove); - sparse_mark.resize(lp.num_cols, 1); - for (i_t k : dense_columns) { - sparse_mark[k] = 0; - } + sparse_mark.resize(lp.num_cols, 1); + for (i_t k : dense_columns) { + sparse_mark[k] = 0; + } - A_dense.resize(AD.m, n_dense_columns); - i_t k = 0; - for (i_t j : dense_columns) { - A_dense.from_sparse(lp.A, j, k++); + A_dense.resize(AD.m, n_dense_columns); + i_t k = 0; + for (i_t j : dense_columns) { + A_dense.from_sparse(lp.A, j, k++); + } } - } - AD.transpose(AT); + AD.transpose(AT); + } // device_AD / device_A / ADAT path is only used when forming ADAT (!use_augmented). if (!use_augmented) { + raft::common::nvtx::range scope("Barrier: LP Data: device AD path"); device_AD.copy(AD, handle_ptr->get_stream()); d_original_A_values.resize(device_AD.x.size(), handle_ptr->get_stream()); raft::copy(d_original_A_values.data(), @@ -421,108 +612,273 @@ class iteration_data_t { } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows; - chol = - std::make_unique>(handle_ptr, settings, factorization_size); - chol->set_positive_definite(false); + { + raft::common::nvtx::range scope("Barrier: LP Data: Cholesky init"); + i_t factorization_size = use_augmented ? 
lp.num_rows + lp.num_cols : lp.num_rows; + chol = std::make_unique>( + handle_ptr, settings, factorization_size); + chol->set_positive_definite(false); + } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - // Perform symbolic analysis - symbolic_status = 0; - if (use_augmented) { - // Build the sparsity pattern of the augmented system - form_augmented(true); - if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - symbolic_status = chol->analyze(device_augmented); - } else { - form_adat(true); - if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - symbolic_status = chol->analyze(device_ADAT); + { + raft::common::nvtx::range scope("Barrier: LP Data: symbolic analysis"); + // Perform symbolic analysis + symbolic_status = 0; + if (use_augmented) { + { + raft::common::nvtx::range form_scope("Barrier: LP Data: form augmented"); + // Build the sparsity pattern of the augmented system + form_augmented(true); + } + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } + symbolic_status = chol->analyze(device_augmented); + } else { + { + raft::common::nvtx::range form_scope("Barrier: LP Data: form ADAT"); + form_adat(true); + } + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } + symbolic_status = chol->analyze(device_ADAT); + } + } + } + + bool has_cones() const { return cones_.has_value(); } + + cone_data_t& cones() + { + cuopt_assert(cones_.has_value(), "second-order cone data is not initialized"); + return *cones_; + } + + const cone_data_t& cones() const + { + cuopt_assert(cones_.has_value(), "second-order cone data is not initialized"); + return *cones_; + } + + i_t cone_count() const { return has_cones() ? cones_->n_cones : i_t(0); } + + i_t cone_entry_count() const + { + return has_cones() ? 
static_cast(cones_->n_cone_entries) : i_t(0); + } + + i_t cone_start() const { return cone_var_start_; } + + i_t cone_end() const { return cone_start() + cone_entry_count(); } + + i_t linear_xz_size(std::size_t full_xz_size) const + { + return has_cones() ? cone_start() : static_cast(full_xz_size); + } + + bool is_cone_variable(i_t variable) const + { + return has_cones() && variable >= cone_start() && variable < cone_end(); + } + + f_t complementarity_degree(std::size_t num_primal_variables, i_t num_upper_bounds) const + { + const bool has_soc = has_cones(); + f_t degree = static_cast(num_primal_variables) + static_cast(num_upper_bounds); + // Direct QP free variables (linear only): no x·z complementarity in the barrier degree. + degree -= static_cast(n_direct_free_linear); + if (has_soc) { + degree -= static_cast(cone_entry_count()); + degree += static_cast(cone_count()); } + return degree; } void form_augmented(bool first_call = false) { - i_t n = A.n; - i_t m = A.m; - i_t nnzA = A.col_start[n]; - i_t nnzQ = Q.n > 0 ? Q.col_start[n] : 0; - i_t factorization_size = n + m; - const f_t dual_perturb = 0.0; - const f_t primal_perturb = 1e-6; + i_t n = A.n; + i_t m = A.m; + i_t nnzA = A.col_start[n]; + i_t nnzQ = Q.n > 0 ? 
Q.col_start[n] : 0; + i_t factorization_size = n + m; + + const bool has_soc = has_cones(); + const i_t m_c = cone_entry_count(); + i_t total_block_nnz = 0; + + std::vector cone_offsets_host; + std::vector cone_block_offsets_host; + std::vector local_to_cone; + if (first_call) { - i_t new_nnz = 2 * nnzA + n + m + nnzQ; + if (has_soc) { + raft::common::nvtx::range scope("Barrier: augmented: SOC offsets"); + // Initialize the cone block offsets + const i_t n_cones = cone_count(); + cone_offsets_host.resize(n_cones + 1); + cone_block_offsets_host.resize(n_cones + 1); + raft::copy( + cone_offsets_host.data(), cones().cone_offsets.data(), n_cones + 1, stream_view_); + handle_ptr->sync_stream(); + for (i_t k = 0; k < n_cones; ++k) { + const auto q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; + cone_block_offsets_host[k + 1] = cone_block_offsets_host[k] + q_k * q_k; + } + total_block_nnz = static_cast(cone_block_offsets_host[n_cones]); + + // Precompute: for each local cone entry, which cone does it belong to? 
+ local_to_cone.resize(m_c); + for (i_t k = 0; k < n_cones; ++k) { + const i_t lo = cone_offsets_host[k]; + const i_t hi = cone_offsets_host[k + 1]; + for (i_t p = lo; p < hi; p++) { + local_to_cone[p] = k; + } + } + } + + i_t new_nnz = 2 * nnzA + n + m + nnzQ + total_block_nnz; // conservative estimate of nnz csr_matrix_t augmented_CSR(n + m, n + m, new_nnz); std::vector augmented_diagonal_indices(n + m, -1); + std::vector cone_csr_indices_host(total_block_nnz, -1); + std::vector cone_Q_values_host(total_block_nnz, f_t(0)); i_t q = 0; i_t off_diag_Qnz = 0; - for (i_t i = 0; i < n; i++) { - augmented_CSR.row_start[i] = q; - if (nnzQ == 0) { - augmented_diagonal_indices[i] = q; - augmented_CSR.j[q] = i; - augmented_CSR.x[q++] = -diag[i] - dual_perturb; - } else { - // Q is symmetric - const i_t q_col_beg = Q.col_start[i]; - const i_t q_col_end = Q.col_start[i + 1]; - bool has_diagonal = false; - for (i_t p = q_col_beg; p < q_col_end; ++p) { - augmented_CSR.j[q] = Q.i[p]; - if (Q.i[p] == i) { - has_diagonal = true; - augmented_diagonal_indices[i] = q; - augmented_CSR.x[q++] = -Q.x[p] - diag[i] - dual_perturb; - } else { + { + raft::common::nvtx::range scope("Barrier: augmented: host CSR build"); + for (i_t i = 0; i < n; i++) { + augmented_CSR.row_start[i] = q; + + const bool is_cone_row = is_cone_variable(i); + + if (is_cone_row) { + // Determine which cone this variable belongs to and its local row + i_t local_idx = i - cone_start(); + i_t k = local_to_cone[local_idx]; + i_t local_r = + static_cast(static_cast(local_idx) - cone_offsets_host[k]); + i_t q_k = static_cast(cone_offsets_host[k + 1] - cone_offsets_host[k]); + i_t cone_col_start = cone_start() + static_cast(cone_offsets_host[k]); + i_t block_base = static_cast(cone_block_offsets_host[k]) + local_r * q_k; + + // Merge-join: Q entries (sorted) with dense cone block columns (contiguous) + i_t qp = (nnzQ > 0) ? Q.col_start[i] : 0; + i_t q_end = (nnzQ > 0) ? 
Q.col_start[i + 1] : 0; + + // Q entries before cone block + for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; off_diag_Qnz++; - augmented_CSR.x[q++] = -Q.x[p]; } - } - if (!has_diagonal) { + + // Dense cone block, absorbing any Q entries that fall inside + for (i_t c = 0; c < q_k; c++) { + i_t col = cone_col_start + c; + f_t q_contrib = f_t(0); + f_t initial_val = (c == local_r) ? f_t(-dual_perturb) + : f_t(0); // diagonal entry of the cone block column + + if (qp < q_end && Q.i[qp] == col) { + q_contrib = Q.x[qp]; + qp++; + } + + cone_csr_indices_host[block_base + c] = q; + cone_Q_values_host[block_base + c] = q_contrib; + if (col == i) { augmented_diagonal_indices[i] = q; } + augmented_CSR.j[q] = col; + augmented_CSR.x[q++] = initial_val - q_contrib; + } + + // Q entries after cone block + for (; qp < q_end; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } + } else if (nnzQ == 0) { augmented_diagonal_indices[i] = q; augmented_CSR.j[q] = i; augmented_CSR.x[q++] = -diag[i] - dual_perturb; + } else { + // Q is symmetric + const i_t q_col_beg = Q.col_start[i]; + const i_t q_col_end = Q.col_start[i + 1]; + bool has_diagonal = false; + for (i_t p = q_col_beg; p < q_col_end; ++p) { + augmented_CSR.j[q] = Q.i[p]; + if (Q.i[p] == i) { + has_diagonal = true; + augmented_diagonal_indices[i] = q; + augmented_CSR.x[q++] = -Q.x[p] - diag[i] - dual_perturb; + } else { + off_diag_Qnz++; + augmented_CSR.x[q++] = -Q.x[p]; + } + } + if (!has_diagonal) { + augmented_diagonal_indices[i] = q; + augmented_CSR.j[q] = i; + augmented_CSR.x[q++] = -diag[i] - dual_perturb; + } + } + // AT block, we can use A in csc directly + const i_t col_beg = A.col_start[i]; + const i_t col_end = A.col_start[i + 1]; + for (i_t p = col_beg; p < col_end; ++p) { + augmented_CSR.j[q] = A.i[p] + n; + augmented_CSR.x[q++] = A.x[p]; } } - // AT block, we can use A in csc directly - const i_t 
col_beg = A.col_start[i]; - const i_t col_end = A.col_start[i + 1]; - for (i_t p = col_beg; p < col_end; ++p) { - augmented_CSR.j[q] = A.i[p] + n; - augmented_CSR.x[q++] = A.x[p]; + + for (i_t k = n; k < n + m; ++k) { + // A block, we can use AT in csc directly + augmented_CSR.row_start[k] = q; + const i_t l = k - n; + const i_t col_beg = AT.col_start[l]; + const i_t col_end = AT.col_start[l + 1]; + for (i_t p = col_beg; p < col_end; ++p) { + augmented_CSR.j[q] = AT.i[p]; + augmented_CSR.x[q++] = AT.x[p]; + } + augmented_diagonal_indices[k] = q; + augmented_CSR.j[q] = k; + augmented_CSR.x[q++] = primal_perturb; } + augmented_CSR.row_start[n + m] = q; + augmented_CSR.nz_max = q; + augmented_CSR.j.resize(q); + augmented_CSR.x.resize(q); + i_t expected_nnz = 2 * nnzA + (n - m_c) + total_block_nnz + m + off_diag_Qnz; + settings_.log.debug("augmented nz %d predicted %d\n", q, expected_nnz); + cuopt_assert(q == expected_nnz, "augmented nnz != predicted"); + cuopt_assert(A.col_start[n] == AT.col_start[m], "A nz != AT nz"); } - for (i_t k = n; k < n + m; ++k) { - // A block, we can use AT in csc directly - augmented_CSR.row_start[k] = q; - const i_t l = k - n; - const i_t col_beg = AT.col_start[l]; - const i_t col_end = AT.col_start[l + 1]; - for (i_t p = col_beg; p < col_end; ++p) { - augmented_CSR.j[q] = AT.i[p]; - augmented_CSR.x[q++] = AT.x[p]; + { + raft::common::nvtx::range scope("Barrier: augmented: device upload"); + device_augmented.copy(augmented_CSR, handle_ptr->get_stream()); + d_augmented_diagonal_indices_.resize(augmented_diagonal_indices.size(), + handle_ptr->get_stream()); + raft::copy(d_augmented_diagonal_indices_.data(), + augmented_diagonal_indices.data(), + augmented_diagonal_indices.size(), + handle_ptr->get_stream()); + + if (has_soc) { + d_cone_csr_indices_.resize(total_block_nnz, handle_ptr->get_stream()); + raft::copy(d_cone_csr_indices_.data(), + cone_csr_indices_host.data(), + total_block_nnz, + handle_ptr->get_stream()); + 
d_cone_Q_values_.resize(total_block_nnz, handle_ptr->get_stream()); + raft::copy(d_cone_Q_values_.data(), + cone_Q_values_host.data(), + total_block_nnz, + handle_ptr->get_stream()); } - augmented_diagonal_indices[k] = q; - augmented_CSR.j[q] = k; - augmented_CSR.x[q++] = primal_perturb; - } - augmented_CSR.row_start[n + m] = q; - augmented_CSR.nz_max = q; - augmented_CSR.j.resize(q); - augmented_CSR.x.resize(q); - settings_.log.debug("augmented nz %d predicted %d\n", q, off_diag_Qnz + nnzA + n); - cuopt_assert(q == 2 * nnzA + n + m + off_diag_Qnz, "augmented nnz != predicted"); - cuopt_assert(A.col_start[n] == AT.col_start[m], "A nz != AT nz"); - - device_augmented.copy(augmented_CSR, handle_ptr->get_stream()); - d_augmented_diagonal_indices_.resize(augmented_diagonal_indices.size(), - handle_ptr->get_stream()); - raft::copy(d_augmented_diagonal_indices_.data(), - augmented_diagonal_indices.data(), - augmented_diagonal_indices.size(), - handle_ptr->get_stream()); - handle_ptr->sync_stream(); + + handle_ptr->sync_stream(); + } #ifdef CHECK_SYMMETRY csc_matrix_t augmented_transpose(1, 1, 1); augmented.transpose(augmented_transpose); @@ -537,20 +893,26 @@ class iteration_data_t { cuopt_assert(error.norm1() <= 1e-2, "|| Aug - Aug^T ||_1 > 1e-2"); #endif } else { + const i_t linear_n = has_soc ? cone_start() : n; + + // Refactor: update linear primal diagonals (j < cone_start() for SOCP) with + // -q_diag - d_j - dual_perturb. Cone Hessian block is overwritten by scatter when has_soc. + // Direct-free linear vars: d_j = 0 here and D·x = 0 in augmented_multiply so the Q/D part + // of the diagonal matches the matvec (-q_diag); dual_perturb remains factorization-only. 
thrust::for_each_n(rmm::exec_policy(handle_ptr->get_stream()), thrust::make_counting_iterator(0), - i_t(n), + linear_n, [span_x = cuopt::make_span(device_augmented.x), span_diag_indices = cuopt::make_span(d_augmented_diagonal_indices_), span_q_diag = cuopt::make_span(d_Q_diag_), span_diag = cuopt::make_span(d_diag_), dual_perturb_value = dual_perturb] __device__(i_t j) { - f_t q_diag = span_q_diag.size() > 0 ? span_q_diag[j] : 0.0; - span_x[span_diag_indices[j]] = - -q_diag - span_diag[j] - dual_perturb_value; + f_t q_diag = span_q_diag.size() > 0 ? span_q_diag[j] : 0.0; + const f_t d_j = span_diag[j]; + span_x[span_diag_indices[j]] = -q_diag - d_j - dual_perturb_value; }); - RAFT_CHECK_CUDA(handle_ptr->get_stream()); + thrust::for_each_n(rmm::exec_policy(handle_ptr->get_stream()), thrust::make_counting_iterator(n), i_t(m), @@ -560,6 +922,16 @@ class iteration_data_t { span_x[span_diag_indices[j]] = primal_perturb_value; }); RAFT_CHECK_CUDA(handle_ptr->get_stream()); + + if (has_soc) { + scatter_hessian_into_augmented(cones(), + device_augmented.x, + d_cone_csr_indices_, + d_cone_Q_values_, + handle_ptr->get_stream(), + dual_perturb); + RAFT_CHECK_CUDA(handle_ptr->get_stream()); + } } } @@ -570,43 +942,53 @@ class iteration_data_t { float64_t start_form_adat = tic(); const i_t m = AD.m; - raft::copy(device_AD.x.data(), - d_original_A_values.data(), - d_original_A_values.size(), - handle_ptr->get_stream()); - if (n_dense_columns > 0) { - // Adjust inv_diag - d_inv_diag_prime.resize(AD.n, stream_view_); - // Copy If - cub::DeviceSelect::Flagged( - d_flag_buffer.data(), - flag_buffer_size, - d_inv_diag.data(), - thrust::make_transform_iterator(d_cols_to_remove.data(), cuda::std::logical_not{}), - d_inv_diag_prime.data(), - d_num_flag.data(), - d_inv_diag.size(), - stream_view_); - RAFT_CHECK_CUDA(stream_view_); - } else { - d_inv_diag_prime.resize(inv_diag.size(), stream_view_); - raft::copy(d_inv_diag_prime.data(), d_inv_diag.data(), inv_diag.size(), stream_view_); + 
{ + raft::common::nvtx::range scope("Barrier: Form ADAT: restore A"); + raft::copy(device_AD.x.data(), + d_original_A_values.data(), + d_original_A_values.size(), + handle_ptr->get_stream()); + } + { + raft::common::nvtx::range scope("Barrier: Form ADAT: inv_diag prime"); + if (n_dense_columns > 0) { + // Adjust inv_diag + d_inv_diag_prime.resize(AD.n, stream_view_); + // Copy If + cub::DeviceSelect::Flagged( + d_flag_buffer.data(), + flag_buffer_size, + d_inv_diag.data(), + thrust::make_transform_iterator(d_cols_to_remove.data(), cuda::std::logical_not{}), + d_inv_diag_prime.data(), + d_num_flag.data(), + d_inv_diag.size(), + stream_view_); + RAFT_CHECK_CUDA(stream_view_); + } else { + d_inv_diag_prime.resize(inv_diag.size(), stream_view_); + raft::copy(d_inv_diag_prime.data(), d_inv_diag.data(), inv_diag.size(), stream_view_); + } } cuopt_assert(static_cast(d_inv_diag_prime.size()) == AD.n, "inv_diag_prime.size() != AD.n"); - thrust::for_each_n(rmm::exec_policy(stream_view_), - thrust::make_counting_iterator(0), - i_t(device_AD.x.size()), - [span_x = cuopt::make_span(device_AD.x), - span_scale = cuopt::make_span(d_inv_diag_prime), - span_col_ind = cuopt::make_span(device_AD.col_index)] __device__(i_t i) { - span_x[i] *= span_scale[span_col_ind[i]]; - }); - RAFT_CHECK_CUDA(stream_view_); + { + raft::common::nvtx::range scope("Barrier: Form ADAT: scale AD"); + thrust::for_each_n(rmm::exec_policy(stream_view_), + thrust::make_counting_iterator(0), + i_t(device_AD.x.size()), + [span_x = cuopt::make_span(device_AD.x), + span_scale = cuopt::make_span(d_inv_diag_prime), + span_col_ind = cuopt::make_span(device_AD.col_index)] __device__(i_t i) { + span_x[i] *= span_scale[span_col_ind[i]]; + }); + RAFT_CHECK_CUDA(stream_view_); + } if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } if (first_call) { + raft::common::nvtx::range scope("Barrier: Form ADAT: cusparse init"); try { initialize_cusparse_data( handle_ptr, device_A, device_AD, 
device_ADAT, cusparse_info); @@ -617,8 +999,11 @@ class iteration_data_t { } if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } - multiply_kernels(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); - handle_ptr->sync_stream(); + { + raft::common::nvtx::range scope("Barrier: Form ADAT: ADAT multiply"); + multiply_kernels(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); + handle_ptr->sync_stream(); + } auto adat_nnz = device_ADAT.row_start.element(device_ADAT.m, handle_ptr->get_stream()); float64_t adat_time = toc(start_form_adat); @@ -1436,8 +1821,9 @@ class iteration_data_t { f_t beta, rmm::device_uvector& y) { - const i_t m = A.m; - const i_t n = A.n; + const i_t m = A.m; + const i_t n = A.n; + const bool has_soc = has_cones(); rmm::device_uvector d_x1(n, handle_ptr->get_stream()); rmm::device_uvector d_x2(m, handle_ptr->get_stream()); @@ -1452,19 +1838,36 @@ class iteration_data_t { // y1 <- alpha ( -D * x_1 + A^T x_2) + beta * y1 rmm::device_uvector d_r1(n, handle_ptr->get_stream()); + thrust::fill_n(rmm::exec_policy(stream_view_), d_r1.begin(), n, f_t(0)); + + // r1 <- D * x_1 on linear indices; barrier D is zero on direct free variables + const i_t linear_n = has_soc ? 
cone_start() : n; + pairwise_multiply_skip_direct_free_linear(d_x1.data(), + d_diag_.data(), + d_is_direct_free_linear_.data(), + d_r1.data(), + linear_n, + stream_view_); + RAFT_CHECK_CUDA(stream_view_); - // diag.pairwise_product(x1, r1); - // r1 <- D * x_1 - pairwise_multiply(d_x1.data(), d_diag_.data(), d_r1.data(), n, stream_view_); + // r1 <- D * x_1 + H x_1 (cone Hessian block H = S^T S; accumulate_cone_hessian_matvec) + if (has_soc) { + const i_t m_c = cone_entry_count(); + accumulate_cone_hessian_matvec(raft::device_span(d_x1.data() + cone_start(), m_c), + cones(), + raft::device_span(d_r1.data() + cone_start(), m_c), + stream_view_); + RAFT_CHECK_CUDA(stream_view_); + } - // r1 <- Q x1 + D x1 + // r1 <- Q x1 + D x1 + H x1 (cone: same H as above) if (Q.n > 0) { // matrix_vector_multiply(Q, 1.0, x1, 1.0, r1); cusparse_Q_view_.spmv(1.0, d_x1, 1.0, d_r1); } // y1 <- - alpha * r1 + beta * y1 - // y1.axpy(-alpha, r1, beta); + // flip the sign of r1 = (Q x1 + D x1 + H x1) axpy(-alpha, d_r1.data(), beta, d_y1.data(), d_y1.data(), n, stream_view_); // matrix_transpose_vector_multiply(A, alpha, x2, 1.0, y1); @@ -1554,13 +1957,23 @@ class iteration_data_t { std::vector Qdiag; bool Q_diagonal; rmm::device_uvector d_augmented_diagonal_indices_; + rmm::device_uvector d_cone_csr_indices_; + rmm::device_uvector d_cone_Q_values_; bool indefinite_Q; cusparse_view_t cusparse_Q_view_; + std::optional> cones_; + i_t cone_var_start_ = 0; + bool use_augmented; i_t symbolic_status; - i_t n_free_vars{0}; - rmm::device_uvector d_is_free_; // 1 if variable is free (QP only), 0 otherwise + i_t n_direct_free_linear{0}; + rmm::device_uvector + d_is_direct_free_linear_; // 1 if variable is free in the linear block, else 0 + + // Adaptive regularization for the augmented system + f_t dual_perturb{1e-8}; + f_t primal_perturb{1e-8}; std::unique_ptr> chol; @@ -1623,6 +2036,8 @@ class iteration_data_t { rmm::device_uvector d_tmp4_; rmm::device_uvector d_r1_; rmm::device_uvector d_r1_prime_; 
+ rmm::device_uvector d_augmented_rhs_; + rmm::device_uvector d_augmented_soln_; rmm::device_uvector d_c_; rmm::device_uvector d_upper_; rmm::device_uvector d_u_; @@ -1656,6 +2071,8 @@ class iteration_data_t { rmm::device_uvector d_complementarity_xz_rhs_; rmm::device_uvector d_complementarity_wv_rhs_; rmm::device_uvector d_dual_rhs_; + rmm::device_uvector d_complementarity_target_; + rmm::device_uvector d_cone_hessian_dx_; rmm::device_uvector d_Q_diag_; rmm::device_uvector d_Qx_; @@ -1665,6 +2082,9 @@ class iteration_data_t { transform_reduce_helper_t transform_reduce_helper_; sum_reduce_helper_t sum_reduce_helper_; + bool cone_combined_step_; + f_t cone_sigma_mu_; + rmm::cuda_stream_view stream_view_; const simplex_solver_settings_t& settings_; @@ -1762,7 +2182,54 @@ template int barrier_solver_t::initial_point(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: initial_point"); - const bool use_augmented = data.use_augmented; + const bool use_augmented = data.use_augmented; + const bool has_direct_free_linear = data.n_direct_free_linear > 0; + + // SOCP: data-dependent initial point following SeDuMi (Sturm, 1999). 
+ // mu = sqrt((1 + ||b||_inf) * (1 + ||c||_inf)) + // primal and dual: x = mu * e_K, z = mu * e_K + // where e_K is the identity of the symmetric cone: + // LP block: e = 1, SOC block: e = (sqrt(2), 0, ..., 0) + if (data.has_cones()) { + const i_t cs = data.cone_start(); + const f_t norm_b = vector_norm_inf(lp.rhs); + const f_t norm_c = vector_norm_inf(lp.objective); + const f_t mu = std::sqrt((1.0 + norm_b) * (1.0 + norm_c)); + const f_t sqrt2 = std::sqrt(2.0); + const f_t x_soc = mu * sqrt2; + const f_t z_soc = mu * sqrt2; + // Linear orthant + for (i_t j = 0; j < cs; ++j) { + data.x[j] = mu; + data.z[j] = mu; + } + if (has_direct_free_linear) { + for (i_t j : presolve_info.direct_free_variables) { + if (j < cs) { data.z[j] = 0.0; } + } + } + // SOC blocks + i_t off = 0; + for (size_t k = 0; k < lp.second_order_cone_dims.size(); k++) { + i_t q_k = lp.second_order_cone_dims[k]; + data.x[cs + off] = x_soc; + data.z[cs + off] = z_soc; + for (i_t j = 1; j < q_k; ++j) { + data.x[cs + off + j] = 0.0; + data.z[cs + off + j] = 0.0; + } + off += q_k; + } + data.y.set_scalar(0.0); + if (data.n_upper_bounds > 0) { + data.w.set_scalar(mu); + data.v.set_scalar(mu); + } + return 0; + } + + // Mask used by the two ADAT/augmented branches below to enforce z > 0. 
+ std::vector nonnegative_z(lp.num_cols, 1); // Perform a numerical factorization i_t status; @@ -1890,8 +2357,8 @@ int barrier_solver_t::initial_point(iteration_data_t& data) #endif } - dense_vector_t dual_res(lp.num_cols); float64_t epsilon_adjust = 10.0; + if (settings.barrier_dual_initial_point == -1 || settings.barrier_dual_initial_point == 0) { // Use the dual starting point suggested by the paper // On Implementing Mehrotra’s Predictor–Corrector Interior-Point Method for Linear Programming @@ -1924,7 +2391,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v[k] = -c[j] + epsilon; } } - // Now hande the case with no upper bounds + // Now handle the case with no upper bounds for (i_t j = 0; j < lp.num_cols; j++) { if (lp.upper[j] == inf) { if (c[j] > epsilon_adjust) { @@ -1935,8 +2402,8 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } } // Free variables have z = 0 (no complementarity condition) - if (data.n_free_vars > 0) { - for (i_t j : presolve_info.free_variable_indices) { + if (has_direct_free_linear) { + for (i_t j : presolve_info.direct_free_variables) { data.z[j] = 0.0; } } @@ -1960,7 +2427,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive(epsilon_adjust); + data.z.ensure_positive(epsilon_adjust, nonnegative_z); } else { // First compute rhs = A*Dinv*c dense_vector_t rhs(lp.num_rows); @@ -1984,7 +2451,7 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.gather_upper_bounds(data.z, data.v); data.v.multiply_scalar(-1.0); data.v.ensure_positive(epsilon_adjust); - data.z.ensure_positive(epsilon_adjust); + data.z.ensure_positive(epsilon_adjust, nonnegative_z); } // Verify A'*y + z - E*v - Q*x = c @@ -2001,24 +2468,21 @@ int barrier_solver_t::initial_point(iteration_data_t& data) settings.log.printf("||A^T y + z - E*v - Q*x - c ||: %e\n", vector_norm2(data.dual_residual)); #endif - // Make sure (w, 
x, v, z) > 0 - if (data.n_free_vars > 0) { - std::vector nonnegative_variables(data.x.size(), 1); - for (i_t j : presolve_info.free_variable_indices) { + // Make sure (w, x, v, z) > 0. Skip free variables being handled directly. + data.w.ensure_positive(epsilon_adjust); + std::vector nonnegative_variables(data.x.size(), 1); + if (has_direct_free_linear) { + for (i_t j : presolve_info.direct_free_variables) { nonnegative_variables[j] = 0; } - - data.x.ensure_positive(epsilon_adjust, nonnegative_variables); - - for (i_t j : presolve_info.free_variable_indices) { + } + data.x.ensure_positive(epsilon_adjust, nonnegative_variables); + // Direct free variables: reduced cost z = 0 (no complementarity condition). + if (has_direct_free_linear) { + for (i_t j : presolve_info.direct_free_variables) { data.z[j] = 0.0; } - - } else { - data.x.ensure_positive(epsilon_adjust); } - data.w.ensure_positive(epsilon_adjust); - #ifdef PRINT_INFO settings.log.printf("min v %e min z %e\n", data.v.minimum(), data.z.minimum()); #endif @@ -2202,53 +2666,45 @@ void barrier_solver_t::gpu_compute_residual_norms(const rmm::device_uv primal_residual_norm = std::max(device_vector_norm_inf(data.d_primal_residual_, stream_view_), device_vector_norm_inf(data.d_bound_residual_, stream_view_)); - dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); + dual_residual_norm = device_vector_norm_inf(data.d_dual_residual_, stream_view_); + const bool has_soc = data.has_cones(); + const i_t linear_xz_size = data.linear_xz_size(data.d_complementarity_xz_residual_.size()); + auto linear_xz_span = + raft::device_span(data.d_complementarity_xz_residual_.data(), linear_xz_size); complementarity_residual_norm = - std::max(device_vector_norm_inf(data.d_complementarity_xz_residual_, stream_view_), + std::max(device_vector_norm_inf(linear_xz_span, stream_view_), device_vector_norm_inf(data.d_complementarity_wv_residual_, stream_view_)); + if (has_soc) { + f_t cone_complementarity_norm = 
f_t(0); + raft::device_span cone_dot = data.cones().scratch.template get_slot<0>(); + data.cones().segmented_sum( + data.d_complementarity_xz_residual_.data() + data.cone_start(), cone_dot, stream_view_); + cone_complementarity_norm = thrust::reduce(rmm::exec_policy(stream_view_), + cone_dot.begin(), + cone_dot.end(), + f_t(0), + thrust::maximum()); + complementarity_residual_norm = + std::max(complementarity_residual_norm, cone_complementarity_norm); + } } template -f_t barrier_solver_t::gpu_max_step_to_boundary(iteration_data_t& data, - const rmm::device_uvector& x, - const rmm::device_uvector& dx) +f_t barrier_solver_t::compute_nonnegative_step_length(iteration_data_t& data, + const rmm::device_uvector& x, + const rmm::device_uvector& dx) { - // For x-sized vectors with free variables, skip free vars in ratio test - const bool has_free = data.n_free_vars > 0 && static_cast(x.size()) == lp.num_cols; - - if (has_free) { - auto is_free_ptr = data.d_is_free_.data(); - auto ratio_test_free = [is_free_ptr] HD(const thrust::tuple t) { - const f_t dx_val = thrust::get<0>(t); - const f_t x_val = thrust::get<1>(t); - const i_t is_free = thrust::get<2>(t); - if (is_free) return f_t(1.0); - if (dx_val < f_t(0.0)) return -x_val / dx_val; - return f_t(1.0); - }; - - return data.transform_reduce_helper_.transform_reduce( - thrust::make_zip_iterator(dx.data(), x.data(), is_free_ptr), - thrust::minimum(), - ratio_test_free, - f_t(1.0), - x.size(), - stream_view_); - } - - return data.transform_reduce_helper_.transform_reduce( - thrust::make_zip_iterator(dx.data(), x.data()), - thrust::minimum(), - [] HD(const thrust::tuple t) { - const f_t dx = thrust::get<0>(t); - const f_t x = thrust::get<1>(t); - - if (dx < f_t(0.0)) return -x / dx; - return f_t(1.0); - }, - f_t(1.0), - x.size(), - stream_view_); + const bool has_soc = data.has_cones() && static_cast(x.size()) >= data.cone_end(); + + // SOCP layout is [linear | cone]; stop at cone_start() + const i_t linear_len = has_soc ? 
data.cone_start() : static_cast(x.size()); + return max_nonnegative_step_length_in_range(data.transform_reduce_helper_, + x, + dx, + linear_len, + data.d_is_direct_free_linear_, + static_cast(x.size()) == lp.num_cols, + stream_view_); } template @@ -2258,100 +2714,80 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& dy, pinned_dense_vector_t& dv, pinned_dense_vector_t& dz, + f_t& dual_perturb, + f_t& primal_perturb, f_t& max_residual) { raft::common::nvtx::range fun_scope("Barrier: compute_search_direction"); - const bool debug = false; - const bool use_augmented = data.use_augmented; - - { - raft::common::nvtx::range fun_scope("Barrier: GPU allocation and copies"); + const bool debug = false; + const bool use_augmented = data.use_augmented; + const bool has_soc = data.has_cones(); + const bool has_direct_free_linear = data.n_direct_free_linear > 0; + const i_t m_c = data.cone_entry_count(); + const i_t cone_var_start = data.cone_start(); + const i_t linear_size = data.linear_xz_size(lp.num_cols); - // TMP allocation and copy should happen only once where it's written in a first place - data.d_bound_rhs_.resize(data.bound_rhs.size(), stream_view_); - raft::copy( - data.d_bound_rhs_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream_view_); - data.d_x_.resize(data.x.size(), stream_view_); - raft::copy(data.d_x_.data(), data.x.data(), data.x.size(), stream_view_); - data.d_z_.resize(data.z.size(), stream_view_); - raft::copy(data.d_z_.data(), data.z.data(), data.z.size(), stream_view_); - data.d_w_.resize(data.w.size(), stream_view_); - raft::copy(data.d_w_.data(), data.w.data(), data.w.size(), stream_view_); - data.d_v_.resize(data.v.size(), stream_view_); - raft::copy(data.d_v_.data(), data.v.data(), data.v.size(), stream_view_); - data.d_upper_bounds_.resize(data.upper_bounds.size(), stream_view_); - raft::copy(data.d_upper_bounds_.data(), - data.upper_bounds.data(), - data.upper_bounds.size(), - stream_view_); - 
data.d_dy_.resize(dy.size(), stream_view_); - raft::copy(data.d_dy_.data(), dy.data(), dy.size(), stream_view_); - data.d_dx_.resize(dx.size(), stream_view_); - raft::copy(data.d_h_.data(), data.primal_rhs.data(), data.primal_rhs.size(), stream_view_); - raft::copy(data.d_dual_rhs_.data(), data.dual_rhs.data(), data.dual_rhs.size(), stream_view_); - data.d_dz_.resize(dz.size(), stream_view_); - data.d_dv_.resize(dv.size(), stream_view_); - data.d_dw_.resize(data.bound_rhs.size(), stream_view_); - raft::copy(data.d_dw_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream_view_); - data.d_dw_residual_.resize(data.n_upper_bounds, stream_view_); - data.d_wv_residual_.resize(data.d_complementarity_wv_rhs_.size(), stream_view_); - data.d_xz_residual_.resize(data.d_complementarity_xz_rhs_.size(), stream_view_); - data.d_primal_residual_.resize(lp.rhs.size(), stream_view_); - raft::copy(data.d_primal_residual_.data(), lp.rhs.data(), lp.rhs.size(), stream_view_); - data.d_bound_residual_.resize(data.bound_residual.size(), stream_view_); - data.d_upper_.resize(lp.upper.size(), stream_view_); - raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream_view_); - } + copy_step_rhs_to_device(data, stream_view_); + data.d_dx_.resize(dx.size(), stream_view_); + data.d_dy_.resize(dy.size(), stream_view_); + data.d_dz_.resize(dz.size(), stream_view_); + data.d_dv_.resize(dv.size(), stream_view_); // Solves the linear system // // dw dx dy dv dz - // [ 0 A 0 0 0 ] [ dw ] = [ rp ] - // [ I E' 0 0 0 ] [ dx ] [ rw ] - // [ 0 0 A' -E I ] [ dy ] [ rd ] - // [ 0 Z 0 0 X ] [ dv ] [ rxz ] - // [ V 0 0 W 0 ] [ dz ] [ rwv ] + // [ 0 A 0 0 0 ] [ dw ] = [ rp ] + // [ I E' 0 0 0 ] [ dx ] [ rw ] + // [ 0 0 A' -E I ] [ dy ] [ rd ] + // [ 0 Z(S) 0 0 X(S^-T)] [ dv ] [ rxz ] + // [ V 0 0 W 0 ] [ dz ] [ rwv ] + + // NT-scaling: + // \lambda = Sx = S^-T z + // Affine step: (\lambda + S \delta xa) \circ (\lambda + S^-T \delta za) = 0 + // S \delta xa + S^-T \delta za = - \lambda + 
// \delta za = -S^T (S \delta xa + \lambda) = - S^T S \delta xa -S^T \lambda= - S^T S \delta xa + // - z + if (has_soc && !data.cone_combined_step_) { + auto& cones = data.cones(); + cones.x = raft::device_span(data.d_x_.data() + cone_var_start, m_c); + cones.z = raft::device_span(data.d_z_.data() + cone_var_start, m_c); + launch_nt_scaling(cones, stream_view_); + } max_residual = 0.0; { raft::common::nvtx::range fun_scope("Barrier: GPU diag, inv diag and sqrt inv diag formation"); - // diag = z ./ x - // For native free variables (QP): use Q diagonal if available, otherwise a static regularizer - if (data.n_free_vars > 0) { - constexpr f_t free_var_reg = 1e-7; - if (data.Q.n > 0 && data.Q_diagonal) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple( - data.d_z_.data(), data.d_x_.data(), data.d_is_free_.data(), data.d_Q_diag_.data()), - data.d_diag_.data(), - data.d_diag_.size(), - [free_var_reg] HD(f_t z_j, f_t x_j, i_t is_free, f_t q_jj) { - if (!is_free) return z_j / x_j; - return (q_jj > f_t(0)) ? f_t(0) : free_var_reg; - }, - stream_view_.value()); - } else { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data(), data.d_is_free_.data()), - data.d_diag_.data(), - data.d_diag_.size(), - [free_var_reg] HD(f_t z_j, f_t x_j, i_t is_free) { - return is_free ? free_var_reg : (z_j / x_j); - }, - stream_view_.value()); - } + // Linear orthant barrier on [0, linear_size); direct-free vars get D = 0 here. + if (has_direct_free_linear) { + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + data.d_z_.data(), data.d_x_.data(), data.d_is_direct_free_linear_.data()), + data.d_diag_.data(), + linear_size, + [] HD(f_t z_j, f_t x_j, i_t is_direct_free_linear) { + constexpr f_t free_var_reg = 1e-7; + return is_direct_free_linear ? 
free_var_reg : (z_j / x_j); + }, + stream_view_.value()); } else { cub::DeviceTransform::Transform(cuda::std::make_tuple(data.d_z_.data(), data.d_x_.data()), data.d_diag_.data(), - data.d_diag_.size(), + linear_size, cuda::std::divides<>{}, stream_view_.value()); } RAFT_CHECK_CUDA(stream_view_); - // diag = z ./ x + E * (v ./ w) * E' + // SOC cone block (curvature from H / NT scaling elsewhere in augmented mode). + if (has_soc) { + thrust::fill_n( + rmm::exec_policy(stream_view_), data.d_diag_.begin() + cone_var_start, m_c, f_t(1)); + } + + // Upper-bound slacks: D_j += v_k/w_k. if (data.n_upper_bounds > 0) { cub::DeviceTransform::Transform( cuda::std::make_tuple( @@ -2365,29 +2801,62 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t 0 && data.Q_diagonal) { + // ADAT-only: fold diagonal Q and direct-free regularization (augmented KKT keeps Q explicit). + if (!use_augmented) { + if (data.Q.n > 0 && data.Q_diagonal) { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_Q_diag_.data(), data.d_diag_.data()), + data.d_diag_.data(), + data.d_diag_.size(), + [] HD(f_t Q_diag_j, f_t diag_j) { return diag_j + Q_diag_j; }, + stream_view_.value()); + RAFT_CHECK_CUDA(stream_view_); + } + + constexpr f_t free_var_reg = 1e-7; + if (data.Q.n > 0 && data.Q_diagonal) { + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + data.d_diag_.data(), data.d_is_direct_free_linear_.data(), data.d_Q_diag_.data()), + data.d_diag_.data(), + linear_size, + [free_var_reg] HD(f_t diag_j, i_t is_direct_free_linear, f_t q_jj) { + if (!is_direct_free_linear || q_jj > f_t(0)) return diag_j; + return diag_j + free_var_reg; + }, + stream_view_.value()); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_diag_.data(), data.d_is_direct_free_linear_.data()), + data.d_diag_.data(), + linear_size, + [free_var_reg] HD(f_t diag_j, i_t is_direct_free_linear) { + return is_direct_free_linear ? 
(diag_j + free_var_reg) : diag_j; + }, + stream_view_.value()); + } + RAFT_CHECK_CUDA(stream_view_); + } + + raft::copy(data.diag.data(), data.d_diag_.data(), data.d_diag_.size(), stream_view_); + + // inv_diag and h = A*inv_diag*... are only used for the ADAT solve path. + if (!use_augmented) { cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_Q_diag_.data(), data.d_diag_.data()), data.d_diag_.data(), + data.d_inv_diag.data(), data.d_diag_.size(), - [] HD(f_t Q_diag_j, f_t diag_j) { return diag_j + Q_diag_j; }, + [] HD(f_t diag) { return f_t(1) / diag; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); + raft::copy( + data.inv_diag.data(), data.d_inv_diag.data(), data.d_inv_diag.size(), stream_view_); } - - // inv_diag = 1.0 ./ diag - cub::DeviceTransform::Transform( - data.d_diag_.data(), - data.d_inv_diag.data(), - data.d_diag_.size(), - [] HD(f_t diag) { return f_t(1) / diag; }, - stream_view_.value()); - RAFT_CHECK_CUDA(stream_view_); - raft::copy(data.diag.data(), data.d_diag_.data(), data.d_diag_.size(), stream_view_); - raft::copy(data.inv_diag.data(), data.d_inv_diag.data(), data.d_inv_diag.size(), stream_view_); } + // Track whether we (re)factorize on this call. 
+ const bool did_factorize = !data.has_factorization; + // Form A*D*A' or the augmented system and factorize it if (!data.has_factorization) { raft::common::nvtx::range fun_scope("Barrier: ADAT"); @@ -2395,6 +2864,8 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t 0) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_inv_diag.data(), - data.d_tmp3_.data(), - data.d_complementarity_xz_rhs_.data(), - data.d_x_.data(), - data.d_dual_rhs_.data(), - data.d_is_free_.data()), - thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()), - lp.num_cols, - [] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs, i_t is_free) - -> thrust::tuple { - const f_t xz_term = is_free ? f_t(0) : (complementarity_xz_rhs / x); - const f_t tmp = tmp3 - xz_term + dual_rhs; - return {tmp, inv_diag * tmp}; - }, - stream_view_.value()); - } else { + if (data.n_upper_bounds > 0) { cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_inv_diag.data(), - data.d_tmp3_.data(), - data.d_complementarity_xz_rhs_.data(), - data.d_x_.data(), - data.d_dual_rhs_.data()), - thrust::make_zip_iterator(data.d_tmp3_.data(), data.d_tmp4_.data()), - lp.num_cols, - [] HD(f_t inv_diag, f_t tmp3, f_t complementarity_xz_rhs, f_t x, f_t dual_rhs) - -> thrust::tuple { - const f_t tmp = tmp3 + -(complementarity_xz_rhs / x) + dual_rhs; - return {tmp, inv_diag * tmp}; + cuda::std::make_tuple(data.d_bound_rhs_.data(), + data.d_v_.data(), + data.d_complementarity_wv_rhs_.data(), + data.d_w_.data()), + thrust::make_permutation_iterator(data.d_tmp3_.data(), data.d_upper_bounds_.data()), + data.n_upper_bounds, + [] HD(f_t bound_rhs, f_t v, f_t complementarity_wv_rhs, f_t w) { + return (complementarity_wv_rhs - v * bound_rhs) / w; }, stream_view_.value()); + RAFT_CHECK_CUDA(stream_view_); } + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_tmp3_.data(), + 
data.d_complementarity_target_.data(), + data.d_dual_rhs_.data(), + data.d_is_direct_free_linear_.data()), + data.d_tmp3_.data(), + lp.num_cols, + [] HD(f_t tmp3, f_t target, f_t dual_rhs, i_t is_direct_free_linear) { + const f_t comp_term = is_direct_free_linear ? f_t(0) : target; + return tmp3 + dual_rhs - comp_term; + }, + stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); raft::copy(data.d_r1_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); raft::copy(data.d_r1_prime_.data(), data.d_tmp3_.data(), data.d_tmp3_.size(), stream_view_); + } - // h <- A @ tmp4 .+ primal_rhs + if (!use_augmented) { + raft::common::nvtx::range fun_scope("Barrier: GPU compute H"); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(data.d_inv_diag.data(), data.d_tmp3_.data()), + data.d_tmp4_.data(), + lp.num_cols, + [] HD(f_t inv_diag, f_t tmp3) { return inv_diag * tmp3; }, + stream_view_.value()); + RAFT_CHECK_CUDA(stream_view_); data.cusparse_view_.spmv(1, data.cusparse_tmp4_, 1, data.cusparse_h_); } if (use_augmented) { raft::common::nvtx::range fun_scope("Barrier: GPU augmented solve"); - // r1 <- dual_rhs -complementarity_xz_rhs ./ x + E * ((complementarity_wv_rhs - v .* bound_rhs) - // ./ w) - - rmm::device_uvector d_augmented_rhs(lp.num_cols + lp.num_rows, stream_view_); - raft::copy(d_augmented_rhs.data(), data.d_r1_.data(), lp.num_cols, stream_view_); - raft::copy( - d_augmented_rhs.data() + lp.num_cols, data.primal_rhs.data(), lp.num_rows, stream_view_); - rmm::device_uvector d_augmented_soln(lp.num_cols + lp.num_rows, stream_view_); - data.chol->solve(d_augmented_rhs, d_augmented_soln); + // Augmented RHS [dx; dy]: primal block is d_r1_ (assembled above). + // linear j: dual_rhs[j] - complementarity_target[j] + // + E_j*((complementarity_wv_rhs - v.*bound_rhs)./w) (target = xz_rhs/x; free: 0) + // cone j: dual_rhs[j] - complementarity_target[j] (NT target: -z or combined centering + // term) + // Constraint block: primal_rhs. 
+ + raft::copy(data.d_augmented_rhs_.data(), data.d_r1_.data(), lp.num_cols, stream_view_); + raft::copy(data.d_augmented_rhs_.data() + lp.num_cols, + data.primal_rhs.data(), + lp.num_rows, + stream_view_); + data.chol->solve(data.d_augmented_rhs_, data.d_augmented_soln_); struct op_t { op_t(iteration_data_t& data) : data_(data) {} iteration_data_t& data_; @@ -2525,14 +2985,38 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(op, d_augmented_rhs, d_augmented_soln); + const f_t solve_err = + iterative_refinement(op, data.d_augmented_rhs_, data.d_augmented_soln_); if (solve_err > 1e-1) { settings.log.printf("|| Aug (dx, dy) - aug_rhs || %e after IR\n", solve_err); } + + // Adaptive regularization: increase/decrease based on IR quality. + // Only adapt on calls where we actually (re)factorized — the affine step. + if (did_factorize && data.has_cones()) { + constexpr f_t min_perturb = 1e-8; + constexpr f_t max_perturb = 1e-1; + if (solve_err > 1e-2) { + f_t old_dp = dual_perturb; + dual_perturb = std::min(max_perturb, dual_perturb * 10.0); + primal_perturb = std::min(max_perturb, primal_perturb * 10.0); + settings.log.printf( + " reg UP: %e -> %e (solve_err=%e)\n", old_dp, dual_perturb, solve_err); + } else if (solve_err < 1e-4) { + f_t old_dp = dual_perturb; + dual_perturb = std::max(min_perturb, dual_perturb / 10.0); + primal_perturb = std::max(min_perturb, primal_perturb / 10.0); + if (old_dp != dual_perturb) { + settings.log.printf( + " reg DOWN: %e -> %e (solve_err=%e)\n", old_dp, dual_perturb, solve_err); + } + } + } } - raft::copy(data.d_dx_.data(), d_augmented_soln.data(), lp.num_cols, stream_view_); - raft::copy(data.d_dy_.data(), d_augmented_soln.data() + lp.num_cols, lp.num_rows, stream_view_); + raft::copy(data.d_dx_.data(), data.d_augmented_soln_.data(), lp.num_cols, stream_view_); + raft::copy( + data.d_dy_.data(), data.d_augmented_soln_.data() + lp.num_cols, lp.num_rows, stream_view_); raft::copy(dx.data(), data.d_dx_.data(), 
lp.num_cols, stream_view_); raft::copy(dy.data(), data.d_dy_.data(), lp.num_rows, stream_view_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -2774,35 +3258,25 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t 0) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(), - data.d_z_.data(), - data.d_dx_.data(), - data.d_x_.data(), - data.d_is_free_.data()), - data.d_dz_.data(), - data.d_dz_.size(), - [] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x, i_t is_free) { - return is_free ? f_t(0) : ((complementarity_xz_rhs - z * dx) / x); - }, - stream_view_.value()); - } else { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(data.d_complementarity_xz_rhs_.data(), - data.d_z_.data(), - data.d_dx_.data(), - data.d_x_.data()), - data.d_dz_.data(), - data.d_dz_.size(), - [] HD(f_t complementarity_xz_rhs, f_t z, f_t dx, f_t x) { - return (complementarity_xz_rhs - z * dx) / x; - }, - stream_view_.value()); + const i_t linear_dz_size = has_soc ? 
cone_var_start : static_cast(data.d_dz_.size()); + + if (has_soc) { + recover_cone_dz_from_target( + raft::device_span(data.d_dx_.data() + cone_var_start, m_c), + data.cones(), + raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c), + raft::device_span(data.d_dz_.data() + cone_var_start, m_c), + stream_view_); } - RAFT_CHECK_CUDA(stream_view_); + + recover_linear_orthant_dz( + raft::device_span(data.d_complementarity_target_.data(), linear_dz_size), + raft::device_span(data.d_z_.data(), linear_dz_size), + raft::device_span(data.d_dx_.data(), linear_dz_size), + raft::device_span(data.d_x_.data(), linear_dz_size), + raft::device_span(data.d_dz_.data(), linear_dz_size), + raft::device_span(data.d_is_direct_free_linear_.data(), linear_dz_size), + stream_view_); raft::copy(dz.data(), data.d_dz_.data(), data.d_dz_.size(), stream_view_); } @@ -2810,18 +3284,29 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t out, + raft::device_span rhs, + raft::device_span z, + raft::device_span dz_span, + raft::device_span dx_span, + raft::device_span x) { + if (out.empty()) return; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(rhs.data(), z.data(), dz_span.data(), dx_span.data(), x.data()), + out.data(), + out.size(), + [] HD(f_t complementarity_xz_rhs, f_t z_val, f_t dz_val, f_t dx_val, f_t x_val) { + return z_val * dx_val + x_val * dz_val - complementarity_xz_rhs; + }, + stream_view_.value()); + }; + compute_linear_xz_residual( + raft::device_span(data.d_xz_residual_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_z_.data(), linear_size), + raft::device_span(data.d_dz_.data(), linear_size), + raft::device_span(data.d_dx_.data(), linear_size), + raft::device_span(data.d_x_.data(), linear_size)); RAFT_CHECK_CUDA(stream_view_); const f_t xz_residual_norm = device_vector_norm_inf(data.d_xz_residual_, stream_view_); @@ -2981,40 +3466,192 @@ i_t 
barrier_solver_t::gpu_compute_search_direction(iteration_data_t -void barrier_solver_t::compute_affine_rhs(iteration_data_t& data) +void copy_host_iterate_to_device(iteration_data_t& data, rmm::cuda_stream_view stream) { - raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); + raft::common::nvtx::range fun_scope("Barrier: copy_host_iterate_to_device"); - data.primal_rhs = data.primal_residual; - data.bound_rhs = data.bound_residual; - data.dual_rhs = data.dual_residual; + data.d_x_.resize(data.x.size(), stream); + data.d_z_.resize(data.z.size(), stream); + raft::copy(data.d_x_.data(), data.x.data(), data.x.size(), stream); + raft::copy(data.d_z_.data(), data.z.data(), data.z.size(), stream); - raft::copy(data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_residual_.data(), - data.d_complementarity_xz_residual_.size(), - stream_view_); - raft::copy(data.d_complementarity_wv_rhs_.data(), - data.d_complementarity_wv_residual_.data(), - data.d_complementarity_wv_residual_.size(), - stream_view_); + data.d_w_.resize(data.w.size(), stream); + data.d_v_.resize(data.v.size(), stream); + raft::copy(data.d_w_.data(), data.w.data(), data.w.size(), stream); + raft::copy(data.d_v_.data(), data.v.data(), data.v.size(), stream); + + data.d_y_.resize(data.y.size(), stream); + raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream); + + data.d_upper_bounds_.resize(data.upper_bounds.size(), stream); + raft::copy( + data.d_upper_bounds_.data(), data.upper_bounds.data(), data.upper_bounds.size(), stream); +} + +// One-time static problem data (constant across barrier iterations). 
+template +void copy_static_problem_data_to_device(const lp_problem_t& lp, + iteration_data_t& data, + rmm::cuda_stream_view stream) +{ + raft::common::nvtx::range fun_scope("Barrier: copy_static_problem_data_to_device"); + + data.d_primal_residual_.resize(lp.rhs.size(), stream); + raft::copy(data.d_primal_residual_.data(), lp.rhs.data(), lp.rhs.size(), stream); + + data.d_upper_.resize(lp.upper.size(), stream); + raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream); + + data.d_bound_residual_.resize(data.bound_residual.size(), stream); + data.d_dw_residual_.resize(data.n_upper_bounds, stream); +} - // x.*z -> -x .* z +// Per Mehrotra step: affine/corrector RHS only (iterate stays on device from initial sync / +// next_iterate). +template +void copy_step_rhs_to_device(iteration_data_t& data, rmm::cuda_stream_view stream) +{ + raft::common::nvtx::range fun_scope("Barrier: copy_step_rhs_to_device"); + + data.d_bound_rhs_.resize(data.bound_rhs.size(), stream); + raft::copy(data.d_bound_rhs_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream); + + raft::copy(data.d_h_.data(), data.primal_rhs.data(), data.primal_rhs.size(), stream); + raft::copy(data.d_dual_rhs_.data(), data.dual_rhs.data(), data.dual_rhs.size(), stream); + + data.d_dw_.resize(data.bound_rhs.size(), stream); + raft::copy(data.d_dw_.data(), data.bound_rhs.data(), data.bound_rhs.size(), stream); + + data.d_xz_residual_.resize(data.d_complementarity_xz_rhs_.size(), stream); + data.d_wv_residual_.resize(data.d_complementarity_wv_rhs_.size(), stream); +} + +template +void fill_linear_complementarity_target(iteration_data_t& data, + raft::device_span target, + raft::device_span xz_rhs, + raft::device_span x, + rmm::cuda_stream_view stream) +{ + if (target.empty()) return; cub::DeviceTransform::Transform( - data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_rhs_.data(), - data.d_complementarity_xz_rhs_.size(), - [] HD(f_t xz_rhs) { return -xz_rhs; }, - 
stream_view_.value()); - RAFT_CHECK_CUDA(stream_view_); - // w.*v -> -w .* v + cuda::std::make_tuple(xz_rhs.data(), x.data(), data.d_is_direct_free_linear_.data()), + target.data(), + target.size(), + [] HD(f_t complementarity_xz_rhs, f_t x_val, i_t is_direct_free_linear) { + if (is_direct_free_linear) return f_t(0); + return complementarity_xz_rhs / x_val; + }, + stream.value()); + RAFT_CHECK_CUDA(stream); +} + +template +static f_t host_complementarity_residual_norm(const iteration_data_t& data, + const std::vector& second_order_cone_dims, + rmm::cuda_stream_view stream) +{ + const i_t linear_xz_size = data.linear_xz_size(data.complementarity_xz_residual.size()); + raft::host_span linear_xz_span = + raft::host_span(data.complementarity_xz_residual.data(), linear_xz_size); + f_t complementarity_residual_norm = + std::max(vector_norm_inf(linear_xz_span, stream), + vector_norm_inf(data.complementarity_wv_residual, stream)); + if (data.has_cones()) { + f_t cone_complementarity_norm = f_t(0); + i_t offset = data.cone_start(); + for (i_t q_k : second_order_cone_dims) { + f_t cone_dot = f_t(0); + for (i_t j = 0; j < q_k; ++j) { + cone_dot += data.complementarity_xz_residual[offset + j]; + } + cone_complementarity_norm = std::max(cone_complementarity_norm, cone_dot); + offset += q_k; + } + complementarity_residual_norm = + std::max(complementarity_residual_norm, cone_complementarity_norm); + } + return complementarity_residual_norm; +} + +template +void fill_affine_cone_complementarity_target(iteration_data_t& data, + i_t cone_var_start, + i_t m_c, + rmm::cuda_stream_view stream) +{ + if (m_c == 0) return; + auto& cones = data.cones(); + cones.x = raft::device_span(data.d_x_.data() + cone_var_start, m_c); + cones.z = raft::device_span(data.d_z_.data() + cone_var_start, m_c); + auto cone_target = + raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c); cub::DeviceTransform::Transform( - data.d_complementarity_wv_rhs_.data(), - 
data.d_complementarity_wv_rhs_.data(), - data.d_complementarity_wv_rhs_.size(), - [] HD(f_t wv_rhs) { return -wv_rhs; }, - stream_view_.value()); + cones.z.data(), cone_target.data(), m_c, [] HD(f_t z_val) { return -z_val; }, stream.value()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CHECK_CUDA(stream); +} + +template +void fill_corrector_cone_complementarity_target(iteration_data_t& data, + i_t cone_var_start, + i_t m_c, + f_t sigma_mu, + rmm::cuda_stream_view stream) +{ + if (m_c == 0) return; + auto& cones = data.cones(); + cones.x = raft::device_span(data.d_x_.data() + cone_var_start, m_c); + cones.z = raft::device_span(data.d_z_.data() + cone_var_start, m_c); + auto cone_target = + raft::device_span(data.d_complementarity_target_.data() + cone_var_start, m_c); + compute_combined_cone_rhs_term( + raft::device_span(data.d_dx_aff_.data() + cone_var_start, m_c), + raft::device_span(data.d_dz_aff_.data() + cone_var_start, m_c), + cones, + sigma_mu, + cone_target, + stream); +} + +template +void barrier_solver_t::compute_affine_rhs(iteration_data_t& data) +{ + raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); + const bool has_soc = data.has_cones(); + const i_t linear_size = data.linear_xz_size(lp.num_cols); + const i_t cone_var_start = data.cone_start(); + const i_t m_c = data.cone_entry_count(); + + data.primal_rhs = data.primal_residual; + data.bound_rhs = data.bound_residual; + data.dual_rhs = data.dual_residual; + data.cone_combined_step_ = false; + data.cone_sigma_mu_ = f_t(0); + + // xz -> -x .* z; + negate_complementarity_rhs( + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_residual_.data(), linear_size), + stream_view_); + // w.*v -> -w .* v; + negate_complementarity_rhs(cuopt::make_span(data.d_complementarity_wv_rhs_), + cuopt::make_span(data.d_complementarity_wv_residual_), + stream_view_); RAFT_CHECK_CUDA(stream_view_); + + fill_linear_complementarity_target( + 
data, + raft::device_span(data.d_complementarity_target_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_x_.data(), linear_size), + stream_view_); + if (has_soc) { + cuopt_assert(cone_var_start + m_c == lp.num_cols, "barrier expects [linear | cone] layout"); + fill_affine_cone_complementarity_target(data, cone_var_start, m_c, stream_view_); + } } template @@ -3022,6 +3659,7 @@ void barrier_solver_t::compute_target_mu( iteration_data_t& data, f_t mu, f_t& mu_aff, f_t& sigma, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_target_mu"); + const bool has_soc = data.has_cones(); f_t complementarity_aff_sum = 0.0; // TMP no copy and data should always be on the GPU @@ -3035,12 +3673,27 @@ void barrier_solver_t::compute_target_mu( raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_); raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_); - f_t step_primal_aff = std::min(gpu_max_step_to_boundary(data, data.d_w_, data.d_dw_aff_), - gpu_max_step_to_boundary(data, data.d_x_, data.d_dx_aff_)); - f_t step_dual_aff = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_aff_), - gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_aff_)); + f_t step_primal_aff = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_aff_), + compute_nonnegative_step_length(data, data.d_x_, data.d_dx_aff_)); + f_t step_dual_aff = std::min(compute_nonnegative_step_length(data, data.d_v_, data.d_dv_aff_), + compute_nonnegative_step_length(data, data.d_z_, data.d_dz_aff_)); + + if (has_soc) { + i_t cs = data.cone_start(); + i_t mc = data.cone_entry_count(); + auto [cone_p, cone_d] = + compute_cone_step_length(data.cones(), + raft::device_span(data.d_dx_aff_.data() + cs, mc), + raft::device_span(data.d_dz_aff_.data() + cs, mc), + f_t(1), + stream_view_); + step_primal_aff = std::min(step_primal_aff, cone_p); + 
step_dual_aff = std::min(step_dual_aff, cone_d); + } - if (data.Q.n > 0) { step_primal_aff = step_dual_aff = std::min(step_primal_aff, step_dual_aff); } + if (data.Q.n > 0 || has_soc) { + step_primal_aff = step_dual_aff = std::min(step_primal_aff, step_dual_aff); + } // Compute complementarity_xz_aff_sum = sum(x_aff * z_aff), // where x_aff = x + step_primal_aff * dx_aff and z_aff = z + step_dual_aff * dz_aff @@ -3091,55 +3744,42 @@ void barrier_solver_t::compute_target_mu( stream_view_); complementarity_aff_sum = complementarity_xz_aff_sum + complementarity_wv_aff_sum; - f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); - mu_denom -= static_cast(data.n_free_vars); - mu_denom = std::max(mu_denom, f_t(1.0)); - mu_aff = complementarity_aff_sum / mu_denom; - sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); - new_mu = sigma * mu_aff; -} - -template -static void fill_linear_cc_rhs(iteration_data_t& data, - f_t new_mu, - raft::device_span out, - raft::device_span dx_aff, - raft::device_span dz_aff, - rmm::cuda_stream_view stream_view) -{ - if (out.empty()) return; - if (data.n_free_vars > 0) { - auto is_free_ptr = data.d_is_free_.data(); - cub::DeviceTransform::Transform( - cuda::std::make_tuple(dx_aff.data(), dz_aff.data(), is_free_ptr), - out.data(), - out.size(), - [new_mu] HD(f_t dx_aff_val, f_t dz_aff_val, i_t is_free) { - return is_free ? 
f_t(0) : (-(dx_aff_val * dz_aff_val) + new_mu); - }, - stream_view.value()); - } else { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(dx_aff.data(), dz_aff.data()), - out.data(), - out.size(), - [new_mu] HD(f_t dx_aff_val, f_t dz_aff_val) { return -(dx_aff_val * dz_aff_val) + new_mu; }, - stream_view.value()); - } + const f_t mu_denom = data.complementarity_degree(data.x.size(), data.n_upper_bounds); + mu_aff = complementarity_aff_sum / mu_denom; + sigma = std::max(0.0, std::min(1.0, std::pow(mu_aff / mu, 3.0))); + new_mu = sigma * mu_aff; } template void barrier_solver_t::compute_cc_rhs(iteration_data_t& data, f_t& new_mu) { raft::common::nvtx::range fun_scope("Barrier: compute_cc_rhs"); + const bool has_soc = data.has_cones(); + const i_t linear_size = data.linear_xz_size(lp.num_cols); + + fill_linear_cc_rhs( + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_dx_aff_.data(), linear_size), + raft::device_span(data.d_dz_aff_.data(), linear_size), + new_mu, + raft::device_span(data.d_is_direct_free_linear_.data(), linear_size), + stream_view_); + + const i_t cone_var_start = data.cone_start(); + const i_t m_c = data.cone_entry_count(); + + fill_linear_complementarity_target( + data, + raft::device_span(data.d_complementarity_target_.data(), linear_size), + raft::device_span(data.d_complementarity_xz_rhs_.data(), linear_size), + raft::device_span(data.d_x_.data(), linear_size), + stream_view_); + if (has_soc) { + cuopt_assert(cone_var_start + m_c == lp.num_cols, "barrier expects [linear | cone] layout"); + fill_corrector_cone_complementarity_target( + data, cone_var_start, m_c, new_mu, stream_view_); + } - fill_linear_cc_rhs(data, - new_mu, - cuopt::make_span(data.d_complementarity_xz_rhs_), - cuopt::make_span(data.d_dx_aff_), - cuopt::make_span(data.d_dz_aff_), - stream_view_); - RAFT_CHECK_CUDA(stream_view_); cub::DeviceTransform::Transform( cuda::std::make_tuple(data.d_dw_aff_.data(), 
data.d_dv_aff_.data()), data.d_complementarity_wv_rhs_.data(), @@ -3147,21 +3787,19 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data [new_mu] HD(f_t dw_aff, f_t dv_aff) { return -(dw_aff * dv_aff) + new_mu; }, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - // TMP should be CPU to 0 if CPU and GPU to 0 if GPU data.primal_rhs.set_scalar(0.0); data.bound_rhs.set_scalar(0.0); data.dual_rhs.set_scalar(0.0); + data.cone_combined_step_ = has_soc; + data.cone_sigma_mu_ = has_soc ? new_mu : f_t(0); } template void barrier_solver_t::compute_final_direction(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_final_direction"); - // TODO Nicolas: Redundant copies - data.d_y_.resize(data.y.size(), stream_view_); data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_); - raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream_view_); raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_); #ifdef FINITE_CHECK @@ -3224,17 +3862,32 @@ void barrier_solver_t::compute_primal_dual_step_length(iteration_data_ f_t& step_dual) { raft::common::nvtx::range fun_scope("Barrier: compute_primal_dual_step_length"); + const bool has_soc = data.has_cones(); + f_t max_step_primal = 0.0; f_t max_step_dual = 0.0; - max_step_primal = std::min(gpu_max_step_to_boundary(data, data.d_w_, data.d_dw_), - gpu_max_step_to_boundary(data, data.d_x_, data.d_dx_)); - max_step_dual = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_), - gpu_max_step_to_boundary(data, data.d_z_, data.d_dz_)); + max_step_primal = std::min(compute_nonnegative_step_length(data, data.d_w_, data.d_dw_), + compute_nonnegative_step_length(data, data.d_x_, data.d_dx_)); + max_step_dual = std::min(compute_nonnegative_step_length(data, data.d_v_, data.d_dv_), + compute_nonnegative_step_length(data, data.d_z_, data.d_dz_)); + + if (has_soc) { + i_t cs = data.cone_start(); + i_t mc = data.cone_entry_count(); + auto [cone_primal, 
cone_dual] = + compute_cone_step_length(data.cones(), + raft::device_span(data.d_dx_.data() + cs, mc), + raft::device_span(data.d_dz_.data() + cs, mc), + f_t(1), + stream_view_); + max_step_primal = std::min(max_step_primal, cone_primal); + max_step_dual = std::min(max_step_dual, cone_dual); + } step_primal = step_scale * max_step_primal; step_dual = step_scale * max_step_dual; - if (data.Q.n > 0) { step_primal = step_dual = std::min(step_primal, step_dual); } + if (data.Q.n > 0 || has_soc) { step_primal = step_dual = std::min(step_primal, step_dual); } } template @@ -3325,11 +3978,8 @@ void barrier_solver_t::compute_mu(iteration_data_t& data, f_ { raft::common::nvtx::range fun_scope("Barrier: compute_mu"); - f_t mu_denom = static_cast(data.x.size()) + static_cast(data.n_upper_bounds); - mu_denom -= static_cast(data.n_free_vars); // free vars don't contribute to mu - mu_denom = std::max(mu_denom, f_t(1.0)); - - mu = (data.sum_reduce_helper_.sum(data.d_complementarity_xz_residual_.begin(), + const f_t mu_denom = data.complementarity_degree(data.x.size(), data.n_upper_bounds); + mu = (data.sum_reduce_helper_.sum(data.d_complementarity_xz_residual_.begin(), data.d_complementarity_xz_residual_.size(), stream_view_) + data.sum_reduce_helper_.sum(data.d_complementarity_wv_residual_.begin(), @@ -3592,10 +4242,16 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t 0) { - settings.log.printf("Quadratic objective matrix: %d nonzeros\n", lp.Q.row_start[lp.Q.n]); + settings.log.printf("Quadratic objective matrix : %d nonzeros\n", lp.Q.row_start[lp.Q.n]); + } + if (lp.second_order_cone_dims.size() > 0) { + settings.log.printf("Second-order cones : %d\n", + static_cast(lp.second_order_cone_dims.size())); } - settings.log.printf("\n"); // Compute the number of free variables i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2; @@ -3613,7 +4269,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t 0) { create_Q(lp, Q); } 
iteration_data_t data( - lp, num_upper_bounds, presolve_info.free_variable_indices, Q, settings); + lp, num_upper_bounds, presolve_info.direct_free_variables, Q, settings); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { settings.log.printf("Barrier solver halted\n"); return lp_status_t::CONCURRENT_LIMIT; @@ -3651,6 +4307,10 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t(data, stream_view_); + copy_static_problem_data_to_device(lp, data, stream_view_); + compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); f_t primal_residual_norm = @@ -3658,10 +4318,11 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t(data.bound_residual, stream_view_)); f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); f_t complementarity_residual_norm = - std::max(vector_norm_inf(data.complementarity_xz_residual, stream_view_), - vector_norm_inf(data.complementarity_wv_residual, stream_view_)); - f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / - (static_cast(n) + static_cast(num_upper_bounds)); + host_complementarity_residual_norm(data, lp.second_order_cone_dims, stream_view_); + + f_t mu_denom = data.complementarity_degree(n, num_upper_bounds); + f_t mu = + (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / mu_denom; f_t norm_b = vector_norm_inf(data.b, stream_view_); f_t norm_c = vector_norm_inf(data.c, stream_view_); @@ -3686,6 +4347,11 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t( + const lp_problem_t& problem, const simplex_solver_settings_t& settings); template class barrier_solver_t; template class sparse_cholesky_base_t; 
template class sparse_cholesky_cudss_t; diff --git a/cpp/src/barrier/barrier.hpp b/cpp/src/barrier/barrier.hpp index 4d25fa930e..f7c93b6de1 100644 --- a/cpp/src/barrier/barrier.hpp +++ b/cpp/src/barrier/barrier.hpp @@ -19,6 +19,11 @@ #include namespace cuopt::linear_programming::dual_simplex { +/** Validates SOC layout on an lp_problem_t before barrier presolve/solve. */ +template +bool validate_barrier_cone_layout(const lp_problem_t& problem, + const simplex_solver_settings_t& settings); + template class iteration_data_t; // Forward declare @@ -92,15 +97,17 @@ class barrier_solver_t { f_t& dual_residual_norm, f_t& complementarity_residual_norm); - f_t gpu_max_step_to_boundary(iteration_data_t& data, - const rmm::device_uvector& x, - const rmm::device_uvector& dx); + f_t compute_nonnegative_step_length(iteration_data_t& data, + const rmm::device_uvector& x, + const rmm::device_uvector& dx); i_t gpu_compute_search_direction(iteration_data_t& data, pinned_dense_vector_t& dw, pinned_dense_vector_t& dx, pinned_dense_vector_t& dy, pinned_dense_vector_t& dv, pinned_dense_vector_t& dz, + f_t& dual_perturb, + f_t& primal_perturb, f_t& max_residual); private: diff --git a/cpp/src/barrier/second_order_cone_kernels.cuh b/cpp/src/barrier/second_order_cone_kernels.cuh new file mode 100644 index 0000000000..5a674fef70 --- /dev/null +++ b/cpp/src/barrier/second_order_cone_kernels.cuh @@ -0,0 +1,1081 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// ============================================================================= +// SOC (second-order cone) kernels for the cuOpt barrier solver. +// +// x_soc : cone primal block +// z_soc : cone dual block +// W, W^{-1} : Nesterov-Todd scaling matrix and inverse. W is symmetric for +// SOC, so W^{T} = W +// H : S^{T} S = S^{2}, the cone KKT block added to the +// primal-reduced system +// eta : sqrt(z_J / x_J), where x_J = sqrt(det_J(x_soc)) +// w : NT scaling direction with det_J(w) = 1 and +// w[head] = sqrt(1 + ||w_tail||^2) +// +// Cone vectors are packed flat: +// entries [cone_offsets[i], cone_offsets[i + 1]) belong to cone i. +// ============================================================================= + +namespace cuopt::linear_programming::dual_simplex { + +inline constexpr int soc_block_size = 256; + +/** + * Reusable device workspace for second-order cone kernels. + * + * The scratch object owns only temporary storage. Kernels may reuse the scalar + * slots and `temp_cone` sequentially inside a higher-level operation, but no + * persistent NT scaling or iterate state is stored here. + */ +template +struct cone_scratch_t { + i_t n_cones; // number of SOC blocks + size_t n_cone_entries; // total packed cone dimension + + rmm::device_uvector slots; // [n_slots * n_cones] + + // Per-cone step candidates before the final min reduction. 
+ rmm::device_uvector step_alpha_primal; // [n_cones] + rmm::device_uvector step_alpha_dual; // [n_cones] + + // TODO: Consider moving this out to the barrier layer when we wire it in + rmm::device_uvector temp_cone; // [n_cone_entries] + + cone_scratch_t(i_t n_cones_in, size_t n_cone_entries_in, rmm::cuda_stream_view stream) + : n_cones(n_cones_in), + n_cone_entries(n_cone_entries_in), + slots(0, stream), + step_alpha_primal(0, stream), + step_alpha_dual(0, stream), + temp_cone(0, stream) + { + const size_t n_cones_size = static_cast(n_cones); + + slots.resize(n_cones_size * static_cast(n_slots), stream); + step_alpha_primal.resize(n_cones_size, stream); + step_alpha_dual.resize(n_cones_size, stream); + temp_cone.resize(n_cone_entries, stream); + } + + template + raft::device_span get_slot() const + { + static_assert(slot_idx >= 0 && slot_idx < n_slots, "scratch slot index out of range"); + const size_t n_cones_size = static_cast(n_cones); + const size_t begin = static_cast(slot_idx) * n_cones_size; + const size_t end = begin + n_cones_size; + return cuopt::make_span(slots, begin, end); + } + + template + raft::device_span get_slot() + { + const auto const_slot = static_cast(*this).template get_slot(); + return raft::device_span(const_cast(const_slot.data()), const_slot.size()); + } +}; + +struct to_size_t_t { + template + HD size_t operator()(value_t value) const + { + return value; + } +}; + +template +HD f_t cone_step_length_from_scalars( + f_t u0, f_t du0, f_t du_tail_sq, f_t u_tail_du_tail, f_t u_tail_sq, f_t alpha_max) +{ + const f_t a = du0 * du0 - du_tail_sq; + const f_t b = u0 * du0 - u_tail_du_tail; + const f_t c_raw = u0 * u0 - u_tail_sq; + const f_t c = c_raw > 0 ? c_raw : 0; + const f_t disc = b * b - a * c; + f_t alpha = alpha_max; + + if (u0 >= 0 && du0 < 0) { alpha = cuda::std::min(alpha, -u0 / du0); } + + if ((a > 0 && b > 0) || disc < 0) { return alpha; } + + if (a == 0) { + return alpha; + } else if (c == 0) { + alpha = a >= 0 ? 
alpha : 0; + } else { + const f_t t = -(b + copysign(sqrt(disc), b)); + f_t r1 = c / t; + f_t r2 = t / a; + if (r1 < 0) { r1 = alpha; } + if (r2 < 0) { r2 = alpha; } + alpha = cuda::std::min(alpha, cuda::std::min(r1, r2)); + } + + return alpha; +} + +template +__global__ void __launch_bounds__(soc_block_size) + step_length_single_kernel(raft::device_span u, + raft::device_span du, + raft::device_span alpha, + raft::device_span du_tail_sq, + raft::device_span u_tail_du_tail, + raft::device_span u_tail_sq, + raft::device_span cone_offsets, + f_t alpha_max, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const size_t off = cone_offsets[cone]; + alpha[cone] = cone_step_length_from_scalars( + u[off], du[off], du_tail_sq[cone], u_tail_du_tail[cone], u_tail_sq[cone], alpha_max); +} + +/** + * Device storage for second-order cone topology, NT scaling, and iterate views. + * + * Flat arrays are packed by cone: entries + * [cone_offsets[i], cone_offsets[i + 1]) belong to cone i, whose dimension is + * cone_dimensions[i]. + * + * The primal/dual cone vectors are non-owning spans over the SOC slice of the + * solver's global x/z vectors. The caller must keep the underlying storage + * alive for the lifetime of this object. + */ +template +struct cone_data_t { + // Topology. This is immutable after construction. + i_t n_cones; // number of SOC blocks + size_t n_cone_entries; // total packed cone dimension = sum(cone_dimensions) + + rmm::device_uvector cone_offsets; // [n_cones + 1], prefix sum of dimensions + rmm::device_uvector cone_dimensions; // [n_cones], dimension q_i of each cone + // Owning cone per entry for upcoming flat per-entry SOC kernels. + rmm::device_uvector element_cone_ids; // [n_cone_entries] + segmented_sum_t segmented_sum; + + // Non-owning iterate views over the cone portion of x/z. 
+ raft::device_span x; // [n_cone_entries], SOC primal block + raft::device_span z; // [n_cone_entries], SOC dual block + + // Persistent Nesterov-Todd scaling state, recomputed from x/z each iteration. + rmm::device_uvector eta; // [n_cones], sqrt(|z|_J / |x|_J) + rmm::device_uvector w; // [n_cone_entries], unit-J-norm NT direction + rmm::device_uvector lambda; // [n_cone_entries], NT point lambda = W^{-T} z + + cone_scratch_t scratch; + + cone_data_t(std::span cone_dimensions_host, + raft::device_span x_in, + raft::device_span z_in, + rmm::cuda_stream_view stream) + : n_cones(cone_dimensions_host.size()), + n_cone_entries( + std::reduce(cone_dimensions_host.begin(), cone_dimensions_host.end(), size_t{0})), + cone_offsets(n_cones + 1, stream), + cone_dimensions(n_cones, stream), + element_cone_ids(n_cone_entries, stream), + segmented_sum(cone_dimensions_host, cuopt::make_span(cone_offsets), stream), + x(x_in), + z(z_in), + eta(n_cones, stream), + w(n_cone_entries, stream), + lambda(n_cone_entries, stream), + scratch(n_cones, n_cone_entries, stream) + { + raft::copy(cone_dimensions.data(), cone_dimensions_host.data(), n_cones, stream); + cone_offsets.set_element_to_zero_async(0, stream); + auto policy = rmm::exec_policy(stream); + + auto cone_dimensions_as_offsets = + thrust::make_transform_iterator(cone_dimensions.begin(), to_size_t_t{}); + thrust::inclusive_scan(policy, + cone_dimensions_as_offsets, + cone_dimensions_as_offsets + n_cones, + cone_offsets.begin() + 1, + cuda::std::plus{}); + + thrust::upper_bound(policy, + cone_offsets.begin() + 1, + cone_offsets.end(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n_cone_entries), + element_cone_ids.begin()); + segmented_sum.template prepare_workspace(stream); + } +}; + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_scaling_scalars_kernel(raft::device_span x, + raft::device_span z, + raft::device_span x_scale, + raft::device_span z_scale, + raft::device_span eta, 
+ raft::device_span cone_offsets, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const size_t off = cone_offsets[cone]; + const f_t x_tail_norm = sqrt(x_scale[cone]); + const f_t z_tail_norm = sqrt(z_scale[cone]); + const f_t x_det = (x[off] - x_tail_norm) * (x[off] + x_tail_norm); + const f_t z_det = (z[off] - z_tail_norm) * (z[off] + z_tail_norm); + + x_scale[cone] = sqrt(x_det); + z_scale[cone] = sqrt(z_det); + eta[cone] = sqrt(z_scale[cone] / x_scale[cone]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_w_scale_kernel(raft::device_span w, + raft::device_span tail_sq, + raft::device_span w_scale, + raft::device_span cone_offsets, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const size_t cone_off = cone_offsets[cone]; + const f_t head = w[cone_off]; + const f_t tail_norm = sqrt(tail_sq[cone]); + const f_t residual = (head - tail_norm) * (head + tail_norm); + w_scale[cone] = sqrt(residual); +} + +/** + * Write unnormalized w: + * + * w_0 = z_0 / z_scale + x_0 / x_scale + * w_tail = z_tail / z_scale - x_tail / x_scale. 
+ */ +template +__global__ void __launch_bounds__(soc_block_size) + nt_write_w_kernel(raft::device_span x, + raft::device_span z, + raft::device_span x_scale, + raft::device_span z_scale, + raft::device_span w, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= w.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + if (idx == cone_off) { + w[idx] = z[idx] / z_scale[cone] + x[idx] / x_scale[cone]; + return; + } + + w[idx] = z[idx] / z_scale[cone] - x[idx] / x_scale[cone]; +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_normalize_w_kernel(raft::device_span w, + raft::device_span w_scale, + raft::device_span element_cone_ids) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= w.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + w[idx] /= w_scale[cone]; +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_finalize_head_kernel(raft::device_span w, + raft::device_span normalized_tail_sq, + raft::device_span cone_offsets, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + w[cone_offsets[cone]] = sqrt(1 + normalized_tail_sq[cone]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + nt_write_lambda_kernel(raft::device_span x, + raft::device_span z, + raft::device_span x_scale, + raft::device_span z_scale, + raft::device_span w_scale, + raft::device_span lambda, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= lambda.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + const f_t x_scale_cone = x_scale[cone]; + 
const f_t z_scale_cone = z_scale[cone]; + const f_t gamma = static_cast(0.5) * w_scale[cone]; + const f_t head_scale = sqrt(x_scale_cone * z_scale_cone); + + if (local_idx == 0) { + lambda[idx] = gamma * head_scale; + return; + } + + const f_t x_head = x[cone_off]; + const f_t z_head = z[cone_off]; + const f_t denom = z_head / z_scale_cone + x_head / x_scale_cone + static_cast(2) * gamma; + const f_t coeff_z = (gamma + x_head / x_scale_cone) / z_scale_cone; + const f_t coeff_x = (gamma + z_head / z_scale_cone) / x_scale_cone; + + const f_t lambda_tail = (coeff_z * z[idx] + coeff_x * x[idx]) / denom; + lambda[idx] = lambda_tail * head_scale; +} + +/** + * Build Nesterov-Todd scaling for packed SOC blocks. + * + * Given interior cone primal/dual blocks x and z: + * + * det_J(x) = x_0^2 - ||x_tail||^2 + * det_J(z) = z_0^2 - ||z_tail||^2 + * x_scale = sqrt(det_J(x)), z_scale = sqrt(det_J(z)) + * eta = sqrt(z_scale / x_scale) + * w_tmp_0 = z_0 / z_scale + x_0 / x_scale + * w_tmp_tail = z_tail / z_scale - x_tail / x_scale + * w_scale = sqrt(det_J(w_tmp)) + * w = w_tmp / w_scale + * w_0 = sqrt(1 + ||w_tail||^2) to re-impose det_J(w) = 1 + * + * Scratch slots: + * 0: ||x_tail||^2 -> x_scale + * 1: ||z_tail||^2 -> z_scale + */ +template +void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view stream) +{ + auto x_scale = cones.scratch.template get_slot<0>(); + auto z_scale = cones.scratch.template get_slot<1>(); + auto w_scale = cones.scratch.template get_slot<2>(); + + const auto span_x = cones.x; + const auto span_z = cones.z; + const auto cone_offsets = cuopt::make_span(cones.cone_offsets); + const auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + + auto x_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_x, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : span_x[idx] * span_x[idx]; + }); + cones.segmented_sum(x_tail_sq_terms, x_scale, stream); + + auto z_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [span_z, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : span_z[idx] * span_z[idx]; + }); + cones.segmented_sum(z_tail_sq_terms, z_scale, stream); + + const size_t cone_grid_dim = + raft::ceildiv(static_cast(cones.n_cones), soc_block_size); + nt_finalize_scaling_scalars_kernel + <<>>( + cones.x, cones.z, x_scale, z_scale, cuopt::make_span(cones.eta), cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + const size_t element_grid_dim = raft::ceildiv(cones.n_cone_entries, soc_block_size); + + auto w = cuopt::make_span(cones.w); + nt_write_w_kernel<<>>( + cones.x, cones.z, x_scale, z_scale, w, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + auto unnormalized_tail_sq_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [cone_offsets, element_cone_ids, w] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : w[idx] * w[idx]; + }); + cones.segmented_sum(unnormalized_tail_sq_terms, w_scale, stream); + + nt_finalize_w_scale_kernel<<>>( + w, w_scale, w_scale, cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + nt_normalize_w_kernel + <<>>(w, w_scale, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Persist lambda while w_scale still stores sqrt(det_J(w_tmp)). 
+ nt_write_lambda_kernel + <<>>(cones.x, + cones.z, + x_scale, + z_scale, + w_scale, + cuopt::make_span(cones.lambda), + cone_offsets, + element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // w_scale is overwritten from here + auto normalized_tail_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [cone_offsets, element_cone_ids, w] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : w[idx] * w[idx]; + }); + cones.segmented_sum(normalized_tail_terms, w_scale, stream); + + nt_finalize_head_kernel<<>>( + cuopt::make_span(cones.w), w_scale, cone_offsets, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +__global__ void __launch_bounds__(soc_block_size) + apply_w_inv_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + const f_t w0 = w[cone_off]; + const f_t zeta = tail_dot[cone]; + const f_t v0 = v[cone_off]; + const f_t inv_eta = f_t(1) / eta[cone]; + + if (local_idx == 0) { + out[idx] = inv_eta * (w0 * v0 - zeta); + return; + } + + const f_t coeff = -v0 + zeta / (f_t(1) + w0); + out[idx] = inv_eta * (v[idx] + coeff * w[idx]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + apply_w_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span tail_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const i_t cone = 
element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + const f_t w0 = w[cone_off]; + const f_t zeta = tail_dot[cone]; + const f_t v0 = v[cone_off]; + const f_t cone_eta = eta[cone]; + + if (local_idx == 0) { + out[idx] = cone_eta * (w0 * v0 + zeta); + return; + } + + const f_t coeff = v0 + zeta / (f_t(1) + w0); + out[idx] = cone_eta * (v[idx] + coeff * w[idx]); +} + +template +__global__ void __launch_bounds__(soc_block_size) + apply_hessian_write_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span wv_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span bias, + f_t output_scale, + f_t bias_scale) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + const f_t eta_sq = (eta[cone] * eta[cone]); + const f_t coeff = 2 * wv_dot[cone] * eta_sq; + const int sign = (local_idx == 0) * 2 - 1; + const f_t value = coeff * w[idx] - eta_sq * v[idx] * sign; + const f_t h_value = output_scale * value; + + out[idx] = bias.empty() ? h_value : bias_scale * bias[idx] + h_value; +} + +template +__global__ void __launch_bounds__(soc_block_size) + gather_cone_heads_kernel(raft::device_span values, + raft::device_span heads, + raft::device_span cone_offsets, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + heads[cone] = values[cone_offsets[cone]]; +} + +/** + * Build the Mehrotra corrector shift: + * + * d = (W dx_aff) o (W^{-T} dz_aff) - sigma_mu e. + * + * On entry, `scaled_dx` is W dx_aff and `scaled_dz` is W^{-T} dz_aff. 
The + * cone head uses the full dot product, and tail entries use the SOC Jordan + * product: + * + * d_0 = - sigma_mu + * d_tail = scaled_dx_0 * scaled_dz_tail + scaled_dz_0 * scaled_dx_tail. + */ +template +__global__ void __launch_bounds__(soc_block_size) + combined_cone_shift_write_kernel(raft::device_span shift, + raft::device_span scaled_dx, + raft::device_span scaled_dz, + raft::device_span full_dot, + raft::device_span scaled_dx_head, + raft::device_span scaled_dz_head, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + f_t sigma_mu) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= shift.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + if (local_idx == 0) { + shift[idx] = full_dot[cone] - sigma_mu; + return; + } + + shift[idx] = scaled_dx_head[cone] * scaled_dz[idx] + scaled_dz_head[cone] * scaled_dx[idx]; +} + +/** + * Per-cone scalar stage for p = lambda \ d: + * + * p_0 = (lambda_0 d_0 - ) / det_J(lambda) + * inv_lambda_0 = 1 / lambda_0. + * + * A second flat kernel writes `-p`, which lets the final W^{-1} call produce + * q = -W^{-1} p without adding an output-scale argument to W^{-1}. 
+ */ +template +__global__ void __launch_bounds__(soc_block_size) + jordan_divide_by_lambda_scalar_kernel(raft::device_span shift, + raft::device_span nt_point, + raft::device_span lambda_tail_dot, + raft::device_span lambda_tail_sq, + raft::device_span p0, + raft::device_span inv_lambda0, + raft::device_span cone_offsets, + i_t n_cones) +{ + const i_t cone = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (cone >= n_cones) { return; } + + const size_t cone_off = cone_offsets[cone]; + const f_t lambda0 = nt_point[cone_off]; + const f_t lambda_tail_norm = sqrt(lambda_tail_sq[cone]); + const f_t det_lambda = (lambda0 - lambda_tail_norm) * (lambda0 + lambda_tail_norm); + + // repurpose the heads in lambda_tail_dot, lambda_tail_sq for each cone + p0[cone] = (lambda0 * shift[cone_off] - lambda_tail_dot[cone]) / det_lambda; + inv_lambda0[cone] = 1 / lambda0; +} + +template +__global__ void __launch_bounds__(soc_block_size) + jordan_divide_by_lambda_write_kernel(raft::device_span shift, + raft::device_span nt_point, + raft::device_span p0, + raft::device_span inv_lambda0, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span out) +{ + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= out.size()) { return; } + + const i_t cone = element_cone_ids[idx]; + const size_t cone_off = cone_offsets[cone]; + const size_t local_idx = idx - cone_off; + + if (local_idx == 0) { + out[idx] = -p0[cone]; + return; + } + + out[idx] = (p0[cone] * nt_point[idx] - shift[idx]) * inv_lambda0[cone]; +} + +/** + * Apply the Nesterov-Todd scaling matrix: out = W^{-1} v. 
+ * + * For each cone: + * zeta = + * (W^{-1}v)_0 = inv_eta * (w_0 v_0 - zeta) + * (W^{-1}v)_tail = inv_eta * (v_tail + (-v_0 + zeta / (1 + w_0)) w_tail) + */ +template +void apply_w_inv(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto tail_dot = cones.scratch.template get_slot<0>(); + + auto tail_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [v, w, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : w[idx] * v[idx]; + }); + cones.segmented_sum(tail_terms, tail_dot, stream); + + const size_t grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_w_inv_write_kernel<<>>( + v, out, w, eta, tail_dot, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Apply the multiplication of Nesterov-Todd scaling matrix: + * out = W v. + * + * For each cone, + * zeta = + * (W * v)_0 = eta * (w_0 v_0 + zeta) + * (W * v)_tail = + * eta * (v_tail + (v_0 + zeta / (1 + w_0)) w_tail) + */ +template +void apply_w(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto tail_dot = cones.scratch.template get_slot<0>(); + + auto tail_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [v, w, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : w[idx] * v[idx]; + }); + cones.segmented_sum(tail_terms, tail_dot, stream); + + const size_t grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_w_write_kernel<<>>( + v, out, w, eta, tail_dot, cone_offsets, element_cone_ids); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Apply the cone KKT block H = S^T S = S^2. + * + * With rho = : + * (Hv)_0 = eta^{2} (2 w_0 rho - v_0) + * (Hv)_tail = eta^{2} (2 w_tail rho + v_tail) + */ +template +void apply_hessian(raft::device_span v, + raft::device_span out, + cone_data_t& cones, + rmm::cuda_stream_view stream, + f_t output_scale = 1, + raft::device_span bias = {}, + f_t bias_scale = 0) +{ + auto w = cuopt::make_span(cones.w); + auto eta = cuopt::make_span(cones.eta); + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto wv_dot = cones.scratch.template get_slot<0>(); + + auto wv_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [v, w] HD(size_t idx) -> f_t { return w[idx] * v[idx]; }); + cones.segmented_sum(wv_terms, wv_dot, stream); + + const size_t grid_dim = raft::ceildiv(out.size(), soc_block_size); + apply_hessian_write_kernel<<>>( + v, out, w, eta, wv_dot, cone_offsets, element_cone_ids, bias, output_scale, bias_scale); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Recover the SOC dual direction after the reduced KKT solve. + * + * The reduced solve gives `dx`; the cone equation supplies the target RHS. + * This function applies the cone block H = S^2 and writes: + * dz = cone_target - H dx. + */ +template +void recover_cone_dz_from_target(raft::device_span dx, + cone_data_t& cones, + raft::device_span cone_target, + raft::device_span dz, + rmm::cuda_stream_view stream) +{ + apply_hessian(dx, dz, cones, stream, -1, cone_target, 1); +} + +/** + * Accumulate the SOC cone-block matvec into an existing output vector. 
+ * + * Used by matrix-free products with the primal-reduced KKT block: + * out += H x, where H = S^2. + */ +template +void accumulate_cone_hessian_matvec(raft::device_span x, + cone_data_t& cones, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + auto out_input = raft::device_span(out.data(), out.size()); + apply_hessian(x, out, cones, stream, 1, out_input, 1); +} + +template +__global__ void __launch_bounds__(soc_block_size) + scatter_hessian_into_augmented_kernel(raft::device_span augmented_x, + raft::device_span csr_indices, + raft::device_span q_values, + raft::device_span w, + raft::device_span eta, + raft::device_span cone_offsets, + raft::device_span block_offsets, + i_t n_cones, + f_t dual_perturb_value) +{ + const size_t e = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (e >= csr_indices.size()) { return; } + + i_t lo = 0; + i_t hi = n_cones; + while (lo < hi) { + const i_t mid = lo + (hi - lo) / 2; + if (block_offsets[mid + 1] <= e) { + lo = mid + 1; + } else { + hi = mid; + } + } + + const i_t cone = lo; + const size_t off = cone_offsets[cone]; + const size_t q = cone_offsets[cone + 1] - off; + const size_t blk_off = block_offsets[cone]; + const size_t local = e - blk_off; + const size_t r = local / q; + const size_t c = local % q; + + const f_t eta_sq = eta[cone] * eta[cone]; + const f_t w0 = w[off]; + const f_t u_r = (r == 0) ? w0 : w[off + r]; + const f_t u_c = (c == 0) ? w0 : w[off + c]; + f_t val = f_t{2} * u_r * eta_sq * u_c; + const f_t diag_correction = (r == 0) ? 
-eta_sq : eta_sq; + if (r == c) { val += diag_correction; } + + augmented_x[csr_indices[e]] = -val - q_values[e]; +} + +template +void scatter_hessian_into_augmented(const cone_data_t& cones, + rmm::device_uvector& augmented_x, + const rmm::device_uvector& csr_indices, + const rmm::device_uvector& q_values, + rmm::cuda_stream_view stream, + f_t dual_perturb_value) +{ + const size_t count = csr_indices.size(); + if (count == 0) { return; } + cuopt_assert(count == q_values.size(), "cone CSR index and Q-value arrays must match"); + + // TODO: This offset calculation should be done in the barrier layer, + // because it is already done in the barrier layer for the augmented system, see + // cone_block_offsets_host. + rmm::device_uvector block_offsets(cones.n_cones + 1, stream); + block_offsets.set_element_to_zero_async(0, stream); + + auto block_sizes = thrust::make_transform_iterator( + cones.cone_dimensions.begin(), [] HD(i_t q) -> size_t { return static_cast(q) * q; }); + thrust::inclusive_scan( + rmm::exec_policy(stream), block_sizes, block_sizes + cones.n_cones, block_offsets.begin() + 1); + + // TODO: use dual_perturb_value for regularization + const size_t grid = raft::ceildiv(count, soc_block_size); + scatter_hessian_into_augmented_kernel + <<>>(cuopt::make_span(augmented_x), + cuopt::make_span(csr_indices), + cuopt::make_span(q_values), + cuopt::make_span(cones.w), + cuopt::make_span(cones.eta), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(block_offsets), + cones.n_cones, + dual_perturb_value); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Compute the maximum primal and dual step lengths that keep SOC blocks + * feasible: + * + * x + alpha dx in Q, z + alpha dz in Q, alpha <= alpha_max. + * + * For one cone u + alpha du, feasibility is + * + * u_0 + alpha du_0 >= ||u_tail + alpha du_tail||. + * + * Squaring gives the quadratic + * + * c + 2 b alpha + a alpha^2 >= 0, + * + * where c = det_J(u), b = u_0 du_0 - , and + * a = det_J(du). 
The per-cone kernel below solves for the first boundary + * crossing, and the final reductions take the global minimum over cones. + */ +template +std::pair compute_cone_step_length(cone_data_t& cones, + raft::device_span dx, + raft::device_span dz, + f_t alpha_max, + rmm::cuda_stream_view stream) +{ + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + auto slot_0 = cones.scratch.template get_slot<0>(); + auto slot_1 = cones.scratch.template get_slot<1>(); + auto slot_2 = cones.scratch.template get_slot<2>(); + + auto run_pass = [&](raft::device_span u, + raft::device_span du, + raft::device_span alpha) { + auto du_tail_sq_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [du, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : du[idx] * du[idx]; + }); + cones.segmented_sum(du_tail_sq_terms, slot_0, stream); + + auto u_tail_du_tail_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [u, du, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : u[idx] * du[idx]; + }); + cones.segmented_sum(u_tail_du_tail_terms, slot_1, stream); + + auto u_tail_sq_terms = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [u, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 
0 : u[idx] * u[idx]; + }); + cones.segmented_sum(u_tail_sq_terms, slot_2, stream); + + const size_t grid_dim = + raft::ceildiv(static_cast(cones.n_cones), soc_block_size); + step_length_single_kernel<<>>( + u, du, alpha, slot_0, slot_1, slot_2, cone_offsets, alpha_max, cones.n_cones); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + }; + + auto alpha_primal = cuopt::make_span(cones.scratch.step_alpha_primal); + auto alpha_dual = cuopt::make_span(cones.scratch.step_alpha_dual); + + run_pass(cones.x, dx, alpha_primal); + run_pass(cones.z, dz, alpha_dual); + + const f_t primal = thrust::reduce(rmm::exec_policy(stream), + alpha_primal.begin(), + alpha_primal.end(), + alpha_max, + thrust::minimum()); + const f_t dual = thrust::reduce(rmm::exec_policy(stream), + alpha_dual.begin(), + alpha_dual.end(), + alpha_max, + thrust::minimum()); + + return {primal, dual}; +} + +/** + * Build the SOC corrector target for the reduced KKT solve. + * + * Mehrotra's corrector uses affine cone directions to form + * + * d = (W dx_aff) o (W^{-T} dz_aff) - sigma_mu e, + * + * where `o` is the SOC Jordan product and `e = (1, 0, ..., 0)` per cone. + * The reduced KKT solve needs the cone target + * + * q = -W * p, where p = lambda \ d and lambda = W^{-T} z. + * + * On return, `out` holds `q`. Internally, `out` is reused for `W^{-T} dz_aff` and + * then `d`; `scratch.temp_cone` is reused for `W dx_aff`, then `-p`. 
+ * + * @param dx_aff Affine primal cone direction, passed as the input of `apply_w`. + * @param dz_aff Affine dual cone direction, passed as the input of `apply_w_inv`. + * @param cones Cone data: `cone_offsets`, `element_cone_ids`, `n_cones`, `n_cone_entries` and `lambda` are read; + * `scratch.temp_cone` and scratch slots 0-2 are used as work buffers. + * @param sigma_mu Scalar forwarded to the shift kernel (the `sigma_mu e` term above). + * @param out Output span; also used as intermediate storage as described above. + * @param stream CUDA stream used for every launch and segmented sum. + */ +template +void compute_combined_cone_rhs_term(raft::device_span dx_aff, + raft::device_span dz_aff, + cone_data_t& cones, + f_t sigma_mu, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + auto cone_offsets = cuopt::make_span(cones.cone_offsets); + auto element_cone_ids = cuopt::make_span(cones.element_cone_ids); + + auto scratch_cone = cuopt::make_span(cones.scratch.temp_cone); + auto scaled_dx = raft::device_span(scratch_cone.data(), scratch_cone.size()); /* view of scratch.temp_cone */ + auto scaled_dz = raft::device_span(out.data(), out.size()); /* view of out */ + auto slot_0 = cones.scratch.template get_slot<0>(); + auto slot_1 = cones.scratch.template get_slot<1>(); + auto slot_2 = cones.scratch.template get_slot<2>(); + + apply_w(dx_aff, scratch_cone, cones, stream); + apply_w_inv(dz_aff, out, cones, stream); + + auto full_product_terms = thrust::make_transform_iterator( + thrust::make_zip_iterator(scaled_dx.begin(), scaled_dz.begin()), + thrust::make_zip_function([] HD(f_t dx, f_t dz) -> f_t { return dx * dz; })); + cones.segmented_sum(full_product_terms, slot_0, stream); /* slot_0: per-cone sum of scaled_dx[i]*scaled_dz[i] over all entries */ + + // `out` currently aliases W^{-T} dz_aff and is about to be overwritten with d. + // Stage both head vectors first because every tail entry needs them.
+ const size_t cone_grid_dim = + raft::ceildiv(static_cast(cones.n_cones), soc_block_size); + gather_cone_heads_kernel<<>>( + scaled_dx, slot_1, cone_offsets, cones.n_cones); + gather_cone_heads_kernel<<>>( + scaled_dz, slot_2, cone_offsets, cones.n_cones); /* slot_1 and slot_2 receive the staged heads of scaled_dx and scaled_dz */ + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + const size_t element_grid_dim = raft::ceildiv(cones.n_cone_entries, soc_block_size); + combined_cone_shift_write_kernel + <<>>( + out, scaled_dx, scaled_dz, slot_0, slot_1, slot_2, cone_offsets, element_cone_ids, sigma_mu); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + auto shift = raft::device_span(out.data(), out.size()); + auto nt_point = raft::device_span(cones.lambda.data(), cones.lambda.size()); + + // Compute q = W * (-(lambda \ d)); `shift` is d, which now lives in `out`. + auto lambda_tail_dot_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [shift, nt_point, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : nt_point[idx] * shift[idx]; + }); + cones.segmented_sum(lambda_tail_dot_terms, slot_0, stream); + + auto lambda_tail_sq_terms = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [nt_point, cone_offsets, element_cone_ids] HD(size_t idx) -> f_t { + const i_t cone = element_cone_ids[idx]; + return idx == cone_offsets[cone] ? 0 : nt_point[idx] * nt_point[idx]; + }); + cones.segmented_sum(lambda_tail_sq_terms, slot_1, stream); + + jordan_divide_by_lambda_scalar_kernel + <<>>( + shift, nt_point, slot_0, slot_1, slot_0, slot_1, cone_offsets, cones.n_cones); /* slot_0/slot_1 are passed twice: presumably read as the tail sums and overwritten in place - confirm against the kernel signature */ + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Note that we implicitly multiply by -1 here since we are writing -p.
+ jordan_divide_by_lambda_write_kernel + <<>>( + shift, nt_point, slot_0, slot_1, cone_offsets, element_cone_ids, scratch_cone); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + apply_w(scratch_cone, out, cones, stream); /* out <- W * (-p) = q */ +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/barrier/second_order_cone_reduction.cuh b/cpp/src/barrier/second_order_cone_reduction.cuh new file mode 100644 index 0000000000..ada9fcfb6a --- /dev/null +++ b/cpp/src/barrier/second_order_cone_reduction.cuh @@ -0,0 +1,261 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +__global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) + warp_per_cone_reduce_kernel(InputIt input, + raft::device_span small_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init); + +/** + * Segmented-sum dispatcher for packed second-order cone vectors. + * + * Cone dimensions are fixed for a solve, so the constructor partitions cone + * ids once by reduction strategy. Each call then reuses those partitions: + * small cones use one warp per cone, medium cones use CUB DeviceSegmentedReduce, + * and large cones use CUB DeviceReduce one cone at a time. The object owns the + * CUB workspace for those medium/large paths. Call `prepare_workspace` once + * before using a CUB-backed path.
+ * + * All three paths write the sum of cone `c` to `output[c]` (indexed by cone id). + * + * The `init` argument of the generic call operator seeds the partial sum of every lane in the + * warp path and is not passed to the CUB paths, so it is only meaningful as the additive + * identity; the span overload passes `f_t{0}`. + */ +template +struct segmented_sum_t { + static_assert(warp_cone_dim > 0); + static_assert(large_cone_cutoff > warp_cone_dim); + + raft::device_span cone_offsets; + rmm::device_uvector small_cone_ids; // cone dimension <= warp_cone_dim + rmm::device_uvector medium_cone_ids; // warp_cone_dim < cone dimension <= large_cone_cutoff + + std::vector large_cone_offsets; + std::vector large_cone_ids; + std::vector large_cone_dimensions; + + // Maximum CUB temporary storage needed by prepared medium/large reductions. + std::size_t cub_workspace_bytes = 0; + rmm::device_buffer cub_workspace; + + private: + template + void prepare_workspace_for_type(rmm::cuda_stream_view stream) + { + auto input = thrust::make_constant_iterator(value_t{}); + auto output = thrust::make_discard_iterator(); + + if (!medium_cone_ids.is_empty()) { + const auto medium_begin_offsets = + thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); + const auto medium_end_offsets = + thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); + + std::size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, + temp_storage_bytes, + input, + output, + medium_cone_ids.size(), + medium_begin_offsets, + medium_end_offsets, + stream.value())); + cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); + } + + for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { + std::size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Sum(nullptr, + temp_storage_bytes, + input + large_cone_offsets[i], + output + large_cone_ids[i], + large_cone_dimensions[i], + stream.value())); + cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); + } + + if (cub_workspace.size() < cub_workspace_bytes) { + cub_workspace.resize(cub_workspace_bytes, stream); + } + } + + public: + template + void prepare_workspace(rmm::cuda_stream_view stream) + { + prepare_workspace_for_type(stream); +
(prepare_workspace_for_type(stream), ...); + } + + template + void operator()(InputIt input, OutputIt output, value_t init, rmm::cuda_stream_view stream) + { + if (!small_cone_ids.is_empty()) { + // Each warp reduces one small cone. `warps_per_cta` only controls how + // many independent cone reductions are packed into one CTA; the default + // of 8 gives a conventional 256-thread block. + const auto n_small = small_cone_ids.size(); + const auto grid = (n_small + warps_per_cta - 1) / warps_per_cta; + warp_per_cone_reduce_kernel + <<>>( + input, cuopt::make_span(small_cone_ids), cone_offsets, output, init); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + if (!medium_cone_ids.is_empty()) { + cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, + "segmented_sum_t::prepare_workspace must be called before reducing medium or " + "large cones"); + + const auto medium_output = thrust::make_permutation_iterator(output, medium_cone_ids.begin()); + const auto medium_begin_offsets = + thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); + const auto medium_end_offsets = + thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); + + std::size_t temp_storage_bytes = cub_workspace_bytes; /* size of the prepared workspace handed to CUB */ + RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(cub_workspace.data(), + temp_storage_bytes, + input, + medium_output, + medium_cone_ids.size(), + medium_begin_offsets, + medium_end_offsets, + stream.value())); + } + + if (!large_cone_ids.empty()) { + cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, + "segmented_sum_t::prepare_workspace must be called before reducing medium or " + "large cones"); + + for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { + std::size_t temp_storage_bytes = cub_workspace_bytes; + RAFT_CUDA_TRY(cub::DeviceReduce::Sum(cub_workspace.data(), + temp_storage_bytes, + input + large_cone_offsets[i], + output + large_cone_ids[i], +
large_cone_dimensions[i], + stream.value())); + } + } + } + + template + void operator()(InputIt input, raft::device_span output, rmm::cuda_stream_view stream) + { + operator()(input, output.data(), f_t{0}, stream); /* zero-seeded sum written through the span's raw pointer */ + } + + segmented_sum_t(std::span cone_dimensions_host, + raft::device_span cone_offsets_in, + rmm::cuda_stream_view stream) + : cone_offsets(cone_offsets_in), + small_cone_ids(0, stream), + medium_cone_ids(0, stream), + cub_workspace(0, stream) + { + std::vector small_cone_ids_host; + std::vector medium_cone_ids_host; + + std::size_t cone_offset = 0; /* running start offset of the current cone, accumulated from the host dimensions */ + i_t cone = 0; + for (const auto cone_dimension : cone_dimensions_host) { + if (cone_dimension <= warp_cone_dim) { + small_cone_ids_host.push_back(cone); + } else if (cone_dimension <= large_cone_cutoff) { + medium_cone_ids_host.push_back(cone); + } else { + large_cone_ids.push_back(cone); + large_cone_offsets.push_back(cone_offset); + large_cone_dimensions.push_back(cone_dimension); + } + cone_offset += cone_dimension; + ++cone; + } + + bool need_sync = false; + if (!small_cone_ids_host.empty()) { + cuopt::device_copy(small_cone_ids, small_cone_ids_host, stream); + need_sync = true; + } + if (!medium_cone_ids_host.empty()) { + cuopt::device_copy(medium_cone_ids, medium_cone_ids_host, stream); + need_sync = true; + } + if (need_sync) { stream.synchronize(); } /* presumably so the copies finish before the local host vectors are destroyed - confirm device_copy is asynchronous */ + } +}; + +template +__global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) + warp_per_cone_reduce_kernel(InputIt input, + raft::device_span small_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init) +{ + static_assert(warps_per_cta > 0); + static_assert(warps_per_cta * raft::WarpSize <= 1024); + + using warp_reduce_t = cub::WarpReduce; + __shared__ typename warp_reduce_t::TempStorage temp_storage[warps_per_cta]; + + const auto lane_id = raft::laneId(); + const auto warp_idx = threadIdx.x / raft::WarpSize; + const auto slot = blockIdx.x * warps_per_cta + warp_idx; /* one slot per warp: index into small_cone_ids */ + if (slot >= small_cone_ids.size()) { return; } /* slot depends only on the block and warp index, not on the lane */ +
+ const auto cone = small_cone_ids[slot]; + const auto off = cone_offsets[cone]; + const auto dim = cone_offsets[cone + 1] - off; + + auto sum = init; /* every lane starts from init, including lanes that read no element */ + for (std::size_t i = lane_id; i < dim; i += raft::WarpSize) { + sum = sum + input[off + i]; + } + + sum = warp_reduce_t(temp_storage[warp_idx]).Sum(sum); + if (lane_id == 0) { output[cone] = sum; } /* only lane 0 stores the cone's result */ +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/barrier/translate_soc.hpp b/cpp/src/barrier/translate_soc.hpp new file mode 100644 index 0000000000..451209bed8 --- /dev/null +++ b/cpp/src/barrier/translate_soc.hpp @@ -0,0 +1,1046 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +/** + * Convert an MPS >= ('G') quadratic row to <= ('L') form in place: negates `linear_values`, `vals` + * and `rhs_value` and sets the row type to 'L'. Rows of any other type are left untouched. + * Intended to be applied to a working copy during SOC conversion. + * + * NOTE(review): only the object passed in is changed. The SOC converter below normalizes a + * per-constraint copy but builds its affine linking rows from the original constraint list, so + * for 'G' rows with a linear part those rows appear to use un-negated coefficients - confirm. + */ +template +void normalize_quadratic_constraint_greater_to_less(qc_t& qc) +{ + if (qc.constraint_row_type != 'G') { return; } + for (f_t& v : qc.linear_values) { + v = -v; + } + for (f_t& v : qc.vals) { + v = -v; + } + qc.rhs_value = -qc.rhs_value; + qc.constraint_row_type = 'L'; +} + +/** + * @brief Expand QCMATRIX second-order cone (and rotated / affine variants) into the + * canonical slack form expected by the simplex/PDLP path: extra variables, equality + * rows, optional cone aliases, column permutation, and `user_problem` cone metadata. + * + * Preconditions: `csr_A` and `user_problem` already reflect the linear model for `n` variables + * and original rows; this routine augments dimensions and CSR row storage in place.
+ */ +template +void convert_quadratic_constraints_to_second_order_cones( + i_t n, + const std::vector::quadratic_constraint_t>& + qcs, + dual_simplex::csr_matrix_t& csr_A, + dual_simplex::user_problem_t& user_problem) +{ + cuopt_expects(!qcs.empty(), + error_type_t::ValidationError, + "Quadratic-constraint flag is set, but no constraints were provided"); + + // Use a practical tolerance for text-parsed MPS numeric values. + const f_t tol = std::numeric_limits::epsilon() * 2; + + // SOC conversion accepts: + // 1) diagonal Lorentz-form QCMATRIX rows: + // -s*x_head^2 + sum_i s*x_tail_i^2 <= 0 (any common s > 0; divide by s to normalize) + // 2) rotated SOC rows: + // -2*d*x_head0*x_head1 + sum_i s*x_tail_i^2 <= 0 (d>0, s>0; canonical d=s) + // symmetric Q off-diagonals (-d,-d) give x^T Q x cross term -2*d*x0*x1, i.e. a*x0*x1 + // in the inequality 2*d*x0*x1 >= s*||tail||^2 with a = 2*d. Lift uses sqrt(d/s) on heads. + // 3) quadratic rows with linear part: + // sum_i s*x_tail_i^2 + a^T x <= 0 + // represented as diagonal +s QCMATRIX entries plus linear terms in COLUMNS. + // We introduce an auxiliary t = -(1/s)*a^T x so the row becomes: + // sum_i x_tail_i^2 - t <= 0 + // then lift it as rotated SOC with implicit second head fixed at 1/2. + // The barrier consumes SOCs as trailing variable blocks [head, tails...], so we validate all + // QCMATRIX blocks first, convert rotated cones via slack variables in standard SOC coordinates, + // then apply a single column permutation to the linear model. + struct rotated_soc_t { + i_t head0{}; + i_t head1{}; + std::vector tails{}; + bool head1_is_constant_half{false}; + /// For two-head rotated SOC: sqrt(d/s) where Q_off = -d and tail diagonals +s (canonical 1). + f_t head_lift_sqrt_ratio{1}; + }; + // This is the index of the auxiliary variable for the linear part of the quadratic constraint. 
+ std::vector qc_affine_heads(qcs.size(), -1); + i_t n_affine_linear_aux = 0; + for (size_t qc_i = 0; qc_i < qcs.size(); ++qc_i) { + if (!qcs[qc_i].linear_values.empty()) { + qc_affine_heads[qc_i] = static_cast(n + n_affine_linear_aux); + ++n_affine_linear_aux; + } + } + + const i_t n_with_affine_aux = static_cast(n + n_affine_linear_aux); + + std::vector> cone_vars; + std::vector cone_dims; + std::vector cone_is_rotated; + std::vector rotated_cones; + std::vector is_cone_var(n_with_affine_aux, 0); + cone_vars.reserve(qcs.size()); + cone_dims.reserve(qcs.size()); + cone_is_rotated.reserve(qcs.size()); + rotated_cones.reserve(qcs.size()); + std::vector qc_soc_uniform_scale(qcs.size(), 1); + + for (size_t qc_i = 0; qc_i < qcs.size(); ++qc_i) { + auto qc = qcs[qc_i]; + cuopt_expects(qc.constraint_row_type != 'E', + error_type_t::ValidationError, + "Equality quadratic constraints are not supported for SOC conversion"); + cuopt_expects(qc.constraint_row_type == 'L' || qc.constraint_row_type == 'G', + error_type_t::ValidationError, + "Quadratic constraint '%s' ROWS type must be 'L' (<=) or 'G' (>=)", + qc.constraint_row_name.c_str()); + normalize_quadratic_constraint_greater_to_less(qc); + cuopt_expects((qc.rhs_value < tol) && (qc.rhs_value > -tol), + error_type_t::ValidationError, + "SOC conversion currently requires rhs = 0 for quadratic constraints"); + cuopt_expects(qc.linear_values.size() == qc.linear_indices.size(), + error_type_t::ValidationError, + "Quadratic constraint '%s' linear_values and linear_indices length mismatch", + qc.constraint_row_name.c_str()); + + const i_t q_nnz = static_cast(qc.vals.size()); + cuopt_expects( + qc.rows.size() == static_cast(q_nnz) && qc.cols.size() == static_cast(q_nnz), + error_type_t::ValidationError, + "Quadratic constraint '%s' Q COO row/col/value length mismatch", + qc.constraint_row_name.c_str()); + cuopt_expects(q_nnz >= 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC must have at least 1 entry in Q 
(nnz %d)", + qc.constraint_row_name.c_str(), + static_cast(q_nnz)); + + // This is the index of the auxiliary variable for the linear part of the quadratic + // constraint. + const i_t affine_head = qc_affine_heads[qc_i]; + const bool has_linear_part = affine_head >= 0; + if (has_linear_part) { + size_t nonzero_terms = 0; + for (size_t p = 0; p < qc.linear_values.size(); ++p) { + const i_t idx = qc.linear_indices[p]; + const f_t v = qc.linear_values[p]; + cuopt_expects(idx >= 0 && idx < n, + error_type_t::ValidationError, + "Quadratic constraint '%s' linear index %d is outside [0, %d)", + qc.constraint_row_name.c_str(), + static_cast(idx), + static_cast(n)); + if (v > -tol && v < tol) { continue; } + ++nonzero_terms; + } + cuopt_expects(nonzero_terms > 0, + error_type_t::ValidationError, + "Quadratic constraint '%s' has linear section but all linear coefficients are " + "zero", + qc.constraint_row_name.c_str()); + } + + // Verify Q as either: + // - standard SOC: one diagonal -s (head), tail diagonals +s for a common s > 0, + // - rotated SOC: symmetric (-s,-s) off-diagonal pair on the two heads, tails +s, + // - affine SOC: tail diagonals +s and linear terms (no Q off-diagonals). + // Feasibility is unchanged after dividing the quadratic row by s; affine rows also scale + // linear coefficients when forming the auxiliary t = -(1/s) a^T x. + + auto approx_eq_scaled = [&](f_t a, f_t b) { + const f_t scale = std::max({f_t(1), std::abs(a), std::abs(b)}); + return std::abs(a - b) <= tol * scale; + }; + + // Sort COO by (row, col); O(nnz log nnz). Enforce at most one stored entry per row (SOC CSR). 
+ std::vector perm(q_nnz); + std::iota(perm.begin(), perm.end(), size_t{0}); + std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) { + const i_t ra = qc.rows[a]; + const i_t rb = qc.rows[b]; + if (ra != rb) { return ra < rb; } + return qc.cols[a] < qc.cols[b]; + }); + + std::vector> q_entries; + q_entries.reserve(q_nnz); + for (size_t t = 0; t < static_cast(q_nnz); ++t) { + const size_t ix = perm[t]; + const i_t r = qc.rows[ix]; + const i_t c = qc.cols[ix]; + const f_t v = qc.vals[ix]; + cuopt_expects(r >= 0 && r < n && c >= 0 && c < n, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q entry (%d,%d) outside [0,%d)", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(c), + static_cast(n)); + if (!q_entries.empty()) { + const i_t prev_r = std::get<0>(q_entries.back()); + cuopt_expects(r != prev_r, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: expected at most one stored entry per " + "row (CSR layout); duplicate or unsorted row in COO", + qc.constraint_row_name.c_str(), + static_cast(r)); + } + q_entries.emplace_back(r, c, v); + } + + std::vector> pos_diag_rows; + std::vector> neg_diag_rows; + std::vector> offdiag_entries; + pos_diag_rows.reserve(q_entries.size()); + neg_diag_rows.reserve(1); + offdiag_entries.reserve(4); + + for (const auto& [r, c, v] : q_entries) { + if (r == c) { + if (v > tol) { + pos_diag_rows.emplace_back(r, v); + } else if (v < -tol) { + neg_diag_rows.emplace_back(r, v); + } else { + cuopt_expects(false, + error_type_t::ValidationError, + "Quadratic constraint '%s' Q row %d: diagonal SOC entry is near zero " + "(%.17g)", + qc.constraint_row_name.c_str(), + static_cast(r), + static_cast(v)); + } + } else { + offdiag_entries.emplace_back(r, c, v); + } + } + + std::vector tail_vars; + tail_vars.reserve(pos_diag_rows.size()); + for (const std::pair& pr : pos_diag_rows) { + tail_vars.push_back(pr.first); + } + + f_t uniform_s = 0; + bool have_uniform_s = false; + auto note_positive_s = 
[&](f_t v) { + cuopt_expects(v > tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected strictly positive diagonal tail " + "coefficient, got %.17g", + qc.constraint_row_name.c_str(), + static_cast(v)); + if (!have_uniform_s) { + uniform_s = v; + have_uniform_s = true; + } else { + cuopt_expects( + approx_eq_scaled(v, uniform_s), + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: all positive diagonal coefficients must match; got " + "%.17g vs %.17g", + qc.constraint_row_name.c_str(), + static_cast(v), + static_cast(uniform_s)); + } + }; + + std::vector cone; + i_t cone_dim = 0; + char is_rotated = 0; + i_t head = -1; + + if (offdiag_entries.empty()) { + if (!has_linear_part) { + if (pos_diag_rows.empty()) { + cuopt_expects(neg_diag_rows.size() == 1 && q_nnz == 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected tail diagonals +s with head -s, " + "or a single head row with q_nnz=1", + qc.constraint_row_name.c_str()); + const f_t neg_v = neg_diag_rows[0].second; + cuopt_expects(neg_v < -tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: cone head diagonal must be negative " + "(%.17g)", + qc.constraint_row_name.c_str(), + static_cast(neg_v)); + uniform_s = -neg_v; + have_uniform_s = true; + head = neg_diag_rows[0].first; + cuopt_expects( + static_cast(tail_vars.size()) == q_nnz - 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected %d diagonal +s entries (tails), found %zu", + qc.constraint_row_name.c_str(), + static_cast(q_nnz - 1), + tail_vars.size()); + cone.reserve(1); + cone.push_back(head); + cone_dim = static_cast(cone.size()); + is_rotated = 0; + } else { + for (const std::pair& pr : pos_diag_rows) { + note_positive_s(pr.second); + } + cuopt_expects(have_uniform_s, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: could not infer uniform positive scale s", + qc.constraint_row_name.c_str()); + cuopt_expects( + 
neg_diag_rows.size() == 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected exactly one diagonal -s (cone head) for " + "%zu tail entries, found %zu negative diagonals", + qc.constraint_row_name.c_str(), + tail_vars.size(), + neg_diag_rows.size()); + cuopt_expects( + static_cast(tail_vars.size()) == q_nnz - 1, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: expected %d diagonal +s entries (tails), found %zu", + qc.constraint_row_name.c_str(), + static_cast(q_nnz - 1), + tail_vars.size()); + const f_t neg_v = neg_diag_rows[0].second; + cuopt_expects( + approx_eq_scaled(neg_v, -uniform_s), + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: cone head diagonal must be -s with the same s as " + "positive tail diagonals; head %.17g vs -s = %.17g", + qc.constraint_row_name.c_str(), + static_cast(neg_v), + static_cast(-uniform_s)); + head = neg_diag_rows[0].first; + cone.reserve(q_nnz); + cone.push_back(head); + cone.insert(cone.end(), tail_vars.begin(), tail_vars.end()); + cone_dim = static_cast(cone.size()); + is_rotated = 0; + } + } else { + cuopt_expects( + neg_diag_rows.empty(), + error_type_t::ValidationError, + "Quadratic constraint '%s' with linear terms cannot contain negative diagonal " + "Q entries", + qc.constraint_row_name.c_str()); + cuopt_expects(affine_head >= 0, + error_type_t::ValidationError, + "Quadratic constraint '%s' internal error: affine SOC head index invalid", + qc.constraint_row_name.c_str()); + for (const std::pair& pr : pos_diag_rows) { + note_positive_s(pr.second); + } + cuopt_expects(have_uniform_s, + error_type_t::ValidationError, + "Quadratic constraint '%s' with linear terms must have at least one " + "diagonal +s term in Q", + qc.constraint_row_name.c_str()); + cuopt_expects(!tail_vars.empty(), + error_type_t::ValidationError, + "Quadratic constraint '%s' with linear terms must have at least one " + "diagonal +s term in Q", + qc.constraint_row_name.c_str()); + for (const 
i_t tail : tail_vars) { + cuopt_expects( + tail != affine_head, + error_type_t::ValidationError, + "Quadratic constraint '%s' with linear terms requires the linear head variable to be " + "distinct from quadratic diagonal variables", + qc.constraint_row_name.c_str()); + } + + cone.reserve(tail_vars.size() + 1); + cone.push_back(affine_head); + cone.insert(cone.end(), tail_vars.begin(), tail_vars.end()); + cone_dim = static_cast(tail_vars.size() + 2); + is_rotated = 1; + rotated_cones.push_back(rotated_soc_t{affine_head, -1, tail_vars, true, 1}); + } + } else { + cuopt_expects(!has_linear_part, + error_type_t::ValidationError, + "Quadratic constraint '%s' with linear terms cannot include rotated-SOC " + "off-diagonal entries", + qc.constraint_row_name.c_str()); + cuopt_expects(neg_diag_rows.empty(), + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q cannot contain diagonal head " + "entries; found %zu negative diagonals", + qc.constraint_row_name.c_str(), + neg_diag_rows.size()); + for (const std::pair& pr : pos_diag_rows) { + note_positive_s(pr.second); + } + cuopt_expects(have_uniform_s, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q: could not infer uniform scale s", + qc.constraint_row_name.c_str()); + cuopt_expects( + offdiag_entries.size() == 2, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q must contain exactly one symmetric off-diagonal " + "pair (-d,-d); found %zu off-diagonal entries", + qc.constraint_row_name.c_str(), + offdiag_entries.size()); + + const i_t a = std::get<0>(offdiag_entries[0]); + const i_t b = std::get<1>(offdiag_entries[0]); + const f_t v0 = std::get<2>(offdiag_entries[0]); + cuopt_expects( + v0 < -tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q off-diagonal must be negative; got %.17g", + qc.constraint_row_name.c_str(), + static_cast(v0)); + cuopt_expects(a != b, + error_type_t::ValidationError, + "Quadratic constraint 
'%s' rotated SOC Q off-diagonal pair must use distinct " + "variables", + qc.constraint_row_name.c_str()); + cuopt_expects(std::get<0>(offdiag_entries[1]) == b && std::get<1>(offdiag_entries[1]) == a, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q must have symmetric entries (a,b) " + "and (b,a) with the same value", + qc.constraint_row_name.c_str()); + const f_t v1 = std::get<2>(offdiag_entries[1]); + cuopt_expects( + v1 < -tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q off-diagonal must be negative; got %.17g", + qc.constraint_row_name.c_str(), + static_cast(v1)); + cuopt_expects( + approx_eq_scaled(v0, v1), + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q symmetric off-diagonals must match; got %.17g " + "and %.17g", + qc.constraint_row_name.c_str(), + static_cast(v0), + static_cast(v1)); + const f_t cross_d = -v0; + cuopt_expects( + cross_d > tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q cross coefficient d = -Q_off must be positive", + qc.constraint_row_name.c_str()); + const f_t head_lift_sqrt_ratio = std::sqrt(cross_d / uniform_s); + cuopt_expects(std::isfinite(static_cast(head_lift_sqrt_ratio)), + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q head lift ratio sqrt(d/s) is not " + "finite (d=%.17g, s=%.17g)", + qc.constraint_row_name.c_str(), + static_cast(cross_d), + static_cast(uniform_s)); + cuopt_expects(static_cast(tail_vars.size()) == q_nnz - 2, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q: expected %d diagonal +s entries " + "(tails), found %zu", + qc.constraint_row_name.c_str(), + static_cast(q_nnz - 2), + tail_vars.size()); + cuopt_expects(q_nnz >= 3, + error_type_t::ValidationError, + "Quadratic constraint '%s' rotated SOC Q must have at least 1 tail entry", + qc.constraint_row_name.c_str()); + + cone.reserve(q_nnz); + cone.push_back(a); + cone.push_back(b); + 
cone.insert(cone.end(), tail_vars.begin(), tail_vars.end()); + cone_dim = static_cast(cone.size()); + is_rotated = 1; + rotated_cones.push_back(rotated_soc_t{a, b, tail_vars, false, head_lift_sqrt_ratio}); + } + + cuopt_expects(have_uniform_s && uniform_s > tol, + error_type_t::ValidationError, + "Quadratic constraint '%s' SOC Q: uniform scale s must be positive (got %.17g)", + qc.constraint_row_name.c_str(), + static_cast(uniform_s)); + qc_soc_uniform_scale[qc_i] = uniform_s; + + for (const i_t var : cone) { + cuopt_expects(var >= 0 && var < static_cast(is_cone_var.size()), + error_type_t::ValidationError, + "SOC variable index %d is outside [0, %zu)", + static_cast(var), + is_cone_var.size()); + } + cone_dims.push_back(cone_dim); + cone_vars.push_back(std::move(cone)); + cone_is_rotated.push_back(is_rotated); + } + // Add affine linear auxiliary variables and linking rows. + if (n_affine_linear_aux > 0) { + const f_t inf = std::numeric_limits::infinity(); + const i_t n_old = static_cast(n); + const i_t n_aug = n_with_affine_aux; + const i_t m_old = csr_A.m; + const i_t m_aug = static_cast(m_old + n_affine_linear_aux); + i_t row_write_cursor = m_old; + + user_problem.objective.resize(n_aug, 0); + user_problem.lower.resize(n_aug, -inf); + user_problem.upper.resize(n_aug, inf); + user_problem.var_types.resize( + n_aug, cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS); + if (!user_problem.col_names.empty()) { user_problem.col_names.resize(n_aug); } + + for (size_t qc_i = 0; qc_i < qcs.size(); ++qc_i) { + const i_t aux_j = qc_affine_heads[qc_i]; + if (aux_j < 0) { continue; } + user_problem.lower[aux_j] = 0; + user_problem.upper[aux_j] = inf; + if (!user_problem.col_names.empty()) { + user_problem.col_names[aux_j] = "_CUOPT_qc_linear_aux_" + std::to_string(aux_j - n_old); + } + } + + user_problem.rhs.resize(m_aug); + user_problem.row_sense.resize(m_aug); + if (!user_problem.row_names.empty()) { user_problem.row_names.resize(m_aug); } + + csr_A.n = 
n_aug; + dual_simplex::sparse_vector_t eq_row; + eq_row.n = n_aug; + + for (size_t qc_i = 0; qc_i < qcs.size(); ++qc_i) { + const i_t aux_j = qc_affine_heads[qc_i]; + if (aux_j < 0) { continue; } + const auto& qc = qcs[qc_i]; + eq_row.i.clear(); + eq_row.x.clear(); + // Define auxiliary as t = -(1/s) a^T x so QC linear part matches normalized cone row. + const f_t inv_s = 1 / qc_soc_uniform_scale[qc_i]; + eq_row.i.push_back(aux_j); + eq_row.x.push_back(1); + for (size_t p = 0; p < qc.linear_values.size(); ++p) { + const f_t v = qc.linear_values[p]; + if (v > -tol && v < tol) { continue; } + eq_row.i.push_back(qc.linear_indices[p]); + eq_row.x.push_back(v * inv_s); + } + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_write_cursor] = 'E'; + user_problem.rhs[row_write_cursor] = 0; + if (!user_problem.row_names.empty()) { + user_problem.row_names[row_write_cursor] = + "_CUOPT_qc_linear_link_" + qc.constraint_row_name; + } + ++row_write_cursor; + } + + cuopt_expects(row_write_cursor == m_aug, + error_type_t::RuntimeError, + "Internal error: affine QC linking row count mismatch"); + cuopt_expects(csr_A.m == m_aug, + error_type_t::RuntimeError, + "Internal error: CSR row count after affine QC linking"); + } + + i_t n_prob = n_with_affine_aux; + + // Convert rotated SOC cones to standard SOC cones. 
+ if (!rotated_cones.empty()) { + cuopt_expects(user_problem.Q_values.empty(), + error_type_t::ValidationError, + "Rotated SOC conversion is currently not supported when the objective has " + "quadratic terms"); + + const f_t inf = std::numeric_limits::infinity(); + const f_t inv_sqrt_2 = f_t(1) / std::sqrt(f_t(2)); + const f_t half = f_t(0.5); + + for (const rotated_soc_t& rc : rotated_cones) { + cuopt_expects(user_problem.var_types[rc.head0] == + cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS, + error_type_t::ValidationError, + "Rotated SOC head variables must be continuous"); + if (!rc.head1_is_constant_half) { + cuopt_expects(user_problem.var_types[rc.head1] == + cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS, + error_type_t::ValidationError, + "Rotated SOC head variables must be continuous"); + } + for (const i_t t : rc.tails) { + cuopt_expects(user_problem.var_types[t] == + cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS, + error_type_t::ValidationError, + "Rotated SOC tail variables must be continuous"); + } + } + + // Lift each rotated cone into standard SOC coordinates with two slacks: + // With x_i' = sqrt(d/s)*x_hi, canonical s0 = (x_0'+x_1')/sqrt(2), s1 = (x_0'-x_1')/sqrt(2) + // so 2*d*x_h0*x_h1 >= s*sum tail^2 <=> 2*x_0'*x_1' >= sum (x_tail)^2 => s0^2 >= s1^2 + + // ... Only the rotated heads are replaced by slacks; tails stay as original variables. 
+ i_t n_slack_total = 0; + for (size_t ci = 0; ci < cone_is_rotated.size(); ++ci) { + if (cone_is_rotated[ci]) { n_slack_total += 2; } + } + + const i_t n_old = n_prob; + n_prob = static_cast(n_old + n_slack_total); + + user_problem.objective.resize(n_prob, 0); + user_problem.lower.resize(n_prob, -inf); + user_problem.upper.resize(n_prob, inf); + user_problem.var_types.resize( + n_prob, cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS); + if (!user_problem.col_names.empty()) { + user_problem.col_names.resize(n_prob); + for (i_t j = n_old; j < n_prob; ++j) { + user_problem.col_names[j] = "_CUOPT_rsoc_slack_" + std::to_string(j - n_old); + } + } + + is_cone_var.resize(n_prob, 0); + + const i_t m_old = csr_A.m; + user_problem.rhs.resize(m_old + n_slack_total); + user_problem.row_sense.resize(m_old + n_slack_total); + if (!user_problem.row_names.empty()) { + user_problem.row_names.resize(m_old + n_slack_total); + for (i_t r = m_old; r < m_old + n_slack_total; ++r) { + user_problem.row_names[r] = "_CUOPT_rsoc_lift_" + std::to_string(r - m_old); + } + } + + csr_A.n = n_prob; + + dual_simplex::sparse_vector_t eq_row; + size_t ri = 0; + i_t slack_base = n_old; + i_t row_idx = m_old; + + for (size_t ci = 0; ci < cone_vars.size(); ++ci) { + if (!cone_is_rotated[ci]) { continue; } + const rotated_soc_t& rc = rotated_cones[ri++]; + const i_t dim = cone_dims[ci]; + std::vector new_cone; + new_cone.reserve(dim); + new_cone.push_back(slack_base); + new_cone.push_back(slack_base + 1); + new_cone.insert(new_cone.end(), rc.tails.begin(), rc.tails.end()); + cone_vars[ci] = std::move(new_cone); + + is_cone_var[slack_base] = 1; + is_cone_var[slack_base + 1] = 1; + + eq_row.n = n_prob; + // If the second head is not constant half, we need to lift it. 
+ if (!rc.head1_is_constant_half) { + const f_t h = inv_sqrt_2 * rc.head_lift_sqrt_ratio; + // s_0 - h * x_h0 - h * x_h1 = 0 (h = inv_sqrt_2 * sqrt(d/s)) + eq_row.i = {rc.head0, rc.head1, slack_base}; + eq_row.x = {-h, -h, f_t(1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = 0; + ++row_idx; + + // s_1 - h * x_h0 + h * x_h1 = 0 + eq_row.i = {rc.head0, rc.head1, slack_base + 1}; + eq_row.x = {-h, h, f_t(1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = 0; + ++row_idx; + + is_cone_var[rc.head0] = 0; + is_cone_var[rc.head1] = 0; + } else { + // One head is constant half, so we can lift it directly. + // s_0 - inv_sqrt_2 * x_h0 = inv_sqrt_2 * (1/2) + eq_row.i = {rc.head0, slack_base}; + eq_row.x = {-inv_sqrt_2, f_t(1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = inv_sqrt_2 * half; + ++row_idx; + + // s_1 - inv_sqrt_2 * x_h0 = -inv_sqrt_2 * (1/2) + eq_row.i = {rc.head0, slack_base + 1}; + eq_row.x = {-inv_sqrt_2, f_t(1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = -inv_sqrt_2 * half; + ++row_idx; + + is_cone_var[rc.head0] = 0; + } + + slack_base += 2; + } + + cuopt_expects(ri == rotated_cones.size(), + error_type_t::RuntimeError, + "Internal error: rotated SOC cone metadata mismatch"); + cuopt_expects(slack_base == n_prob, + error_type_t::RuntimeError, + "Internal error: slack variable count mismatch"); + cuopt_expects(row_idx == m_old + n_slack_total, + error_type_t::RuntimeError, + "Internal error: rotated SOC equality row count mismatch"); + cuopt_expects(csr_A.m == m_old + n_slack_total, + error_type_t::RuntimeError, + "Internal error: CSR row count after rotated SOC lift"); + } + + // If a variable appears in multiple cones, create per-cone aliases and add linking rows + // alias - 
original = 0 so cone variable blocks are disjoint. + { + std::vector first_owner(n_prob, -1); + std::vector> cone_alias_pairs; // (alias, original) + + for (size_t ci = 0; ci < cone_vars.size(); ++ci) { + std::vector& cone = cone_vars[ci]; + for (i_t& var : cone) { + cuopt_expects(var >= 0 && var < n_prob, + error_type_t::ValidationError, + "SOC variable index %d is outside [0, %d)", + static_cast(var), + static_cast(n_prob)); + if (first_owner[var] == -1) { + first_owner[var] = static_cast(ci); + continue; + } + if (first_owner[var] != static_cast(ci)) { + const i_t alias = static_cast(n_prob + cone_alias_pairs.size()); + cone_alias_pairs.emplace_back(alias, var); + var = alias; + } + } + } + + if (!cone_alias_pairs.empty()) { + const i_t n_old = n_prob; + const i_t n_new = static_cast(n_old + cone_alias_pairs.size()); + const i_t m_old = csr_A.m; + const i_t m_new = static_cast(m_old + cone_alias_pairs.size()); + + user_problem.objective.resize(n_new, 0); + user_problem.lower.resize(n_new, -std::numeric_limits::infinity()); + user_problem.upper.resize(n_new, std::numeric_limits::infinity()); + user_problem.var_types.resize( + n_new, cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS); + if (!user_problem.col_names.empty()) { user_problem.col_names.resize(n_new); } + + for (const auto& [alias, original] : cone_alias_pairs) { + // Cone copies are not box-constrained; linking rows tie them to the linear original. + user_problem.lower[alias] = -std::numeric_limits::infinity(); + user_problem.upper[alias] = std::numeric_limits::infinity(); + user_problem.var_types[alias] = user_problem.var_types[original]; + // Keep objective unchanged: alias coefficient stays zero and alias==original links + // values. 
+ if (!user_problem.col_names.empty()) { + user_problem.col_names[alias] = "_CUOPT_cone_alias_" + std::to_string(alias - n_old); + } + } + + user_problem.rhs.resize(m_new); + user_problem.row_sense.resize(m_new); + if (!user_problem.row_names.empty()) { user_problem.row_names.resize(m_new); } + + csr_A.n = n_new; + dual_simplex::sparse_vector_t eq_row; + eq_row.n = n_new; + i_t row_idx = m_old; + for (const auto& [alias, original] : cone_alias_pairs) { + eq_row.i = {alias, original}; + eq_row.x = {f_t(1), f_t(-1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = 0; + if (!user_problem.row_names.empty()) { + user_problem.row_names[row_idx] = + "_CUOPT_cone_alias_link_" + std::to_string(row_idx - m_old); + } + ++row_idx; + } + + cuopt_expects(row_idx == m_new, + error_type_t::RuntimeError, + "Internal error: cone alias linking row count mismatch"); + cuopt_expects(csr_A.m == m_new, + error_type_t::RuntimeError, + "Internal error: CSR row count after cone alias linking"); + + n_prob = n_new; + } + } + + // Bounded cone participants cannot sit in the cone block: + // introduce a free cone copy and alias - original = 0 so the original keeps its bounds + // in the linear block while the barrier sees an unconstrained cone variable. + // Exception: cone heads with lower = 0 need no split because cone membership + // already implies x_0 >= ||x_tail|| >= 0. 
+ { + const f_t neg_inf = -std::numeric_limits::infinity(); + const f_t pos_inf = std::numeric_limits::infinity(); + std::vector> bound_split_pairs; // (cone_alias, linear_original) + + for (std::vector& cone : cone_vars) { + for (size_t idx = 0; idx < cone.size(); idx++) { + i_t& var = cone[idx]; + cuopt_expects(var >= 0 && var < n_prob, + error_type_t::ValidationError, + "SOC variable index %d is outside [0, %d)", + static_cast(var), + static_cast(n_prob)); + if (user_problem.lower[var] == neg_inf && user_problem.upper[var] == pos_inf) { continue; } + // Cone heads with lower = 0 need no split: cone membership implies x_0 >= ||x_tail|| >= 0. + if (idx == 0 && user_problem.lower[var] == 0 && user_problem.upper[var] == pos_inf) { + continue; + } + const i_t alias = static_cast(n_prob + bound_split_pairs.size()); + bound_split_pairs.emplace_back(alias, var); + var = alias; + } + } + + if (!bound_split_pairs.empty()) { + const i_t n_old = n_prob; + const i_t n_new = static_cast(n_old + bound_split_pairs.size()); + const i_t m_old = csr_A.m; + const i_t m_new = static_cast(m_old + bound_split_pairs.size()); + + user_problem.objective.resize(n_new, 0); + user_problem.lower.resize(n_new, neg_inf); + user_problem.upper.resize(n_new, pos_inf); + user_problem.var_types.resize( + n_new, cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS); + if (!user_problem.col_names.empty()) { user_problem.col_names.resize(n_new); } + + for (const auto& [alias, original] : bound_split_pairs) { + user_problem.var_types[alias] = user_problem.var_types[original]; + if (!user_problem.col_names.empty()) { + user_problem.col_names[alias] = + "_CUOPT_cone_bound_split_" + std::to_string(alias - n_old); + } + } + + user_problem.rhs.resize(m_new); + user_problem.row_sense.resize(m_new); + if (!user_problem.row_names.empty()) { user_problem.row_names.resize(m_new); } + + csr_A.n = n_new; + dual_simplex::sparse_vector_t eq_row; + eq_row.n = n_new; + i_t row_idx = m_old; + for (const 
auto& [alias, original] : bound_split_pairs) { + eq_row.i = {alias, original}; + eq_row.x = {f_t(1), f_t(-1)}; + eq_row.sort(); + csr_A.append_row(eq_row); + user_problem.row_sense[row_idx] = 'E'; + user_problem.rhs[row_idx] = 0; + if (!user_problem.row_names.empty()) { + user_problem.row_names[row_idx] = + "_CUOPT_cone_bound_split_link_" + std::to_string(row_idx - m_old); + } + ++row_idx; + } + + cuopt_expects(row_idx == m_new, + error_type_t::RuntimeError, + "Internal error: cone bound-split linking row count mismatch"); + cuopt_expects(csr_A.m == m_new, + error_type_t::RuntimeError, + "Internal error: CSR row count after cone bound-split linking"); + + n_prob = n_new; + } + } + + is_cone_var.assign(n_prob, 0); + for (const std::vector& cone : cone_vars) { + for (const i_t var : cone) { + cuopt_expects(var >= 0 && var < n_prob, + error_type_t::ValidationError, + "SOC variable index %d is outside [0, %d) after cone aliasing", + static_cast(var), + static_cast(n_prob)); + is_cone_var[var] = 1; + } + } + + std::vector old_to_new(n_prob, i_t{-1}); + std::vector new_to_old; + new_to_old.reserve(n_prob); + for (i_t j = 0; j < n_prob; ++j) { + if (is_cone_var[j]) { continue; } + old_to_new[j] = static_cast(new_to_old.size()); + new_to_old.push_back(j); + } + const i_t cone_var_start = static_cast(new_to_old.size()); + for (const std::vector& cone : cone_vars) { + for (const i_t old_j : cone) { + old_to_new[old_j] = static_cast(new_to_old.size()); + new_to_old.push_back(old_j); + } + } + cuopt_expects(static_cast(new_to_old.size()) == n_prob, + error_type_t::RuntimeError, + "Internal error while building SOC variable permutation"); + + for (i_t row = 0; row < csr_A.m; ++row) { + for (i_t p = csr_A.row_start[row]; p < csr_A.row_start[row + 1]; ++p) { + const i_t old_j = csr_A.j[p]; + cuopt_expects(old_j >= 0 && old_j < n_prob, + error_type_t::ValidationError, + "Linear constraint matrix column index %d is outside [0, %d)", + static_cast(old_j), + static_cast(n_prob)); + 
csr_A.j[p] = old_to_new[old_j]; + } + } + + auto permute_dense_by_old_to_new = [&](auto& values, const char* name) { + if (values.empty()) { return; } + using value_t = typename std::decay_t::value_type; + cuopt_expects(values.size() == static_cast(n_prob), + error_type_t::ValidationError, + "%s length %zu does not match number of variables %d", + name, + values.size(), + static_cast(n_prob)); + std::vector permuted(values.size()); + for (i_t old_j = 0; old_j < n_prob; ++old_j) { + permuted[old_to_new[old_j]] = std::move(values[old_j]); + } + values = std::move(permuted); + }; + + permute_dense_by_old_to_new(user_problem.objective, "objective"); + permute_dense_by_old_to_new(user_problem.lower, "lower bounds"); + permute_dense_by_old_to_new(user_problem.upper, "upper bounds"); + permute_dense_by_old_to_new(user_problem.var_types, "variable types"); + permute_dense_by_old_to_new(user_problem.col_names, "column names"); + + if (!user_problem.Q_values.empty()) { + const i_t n_model = static_cast(n); + cuopt_expects(user_problem.Q_indices.size() == user_problem.Q_values.size(), + error_type_t::ValidationError, + "Quadratic objective indices and values length mismatch"); + cuopt_expects(user_problem.Q_offsets.size() == static_cast(n_model) + 1, + error_type_t::ValidationError, + "Quadratic objective CSR offsets length must be n+1 when SOC QCMATRIX " + "conversion permutes variables"); + cuopt_expects(user_problem.Q_offsets[0] == 0, + error_type_t::ValidationError, + "Quadratic objective CSR offsets[0] must be 0"); + cuopt_expects(user_problem.Q_offsets[n_model] == static_cast(user_problem.Q_values.size()), + error_type_t::ValidationError, + "Quadratic objective CSR last offset must equal number of nonzeros"); + + std::vector q_offsets(n_prob + 1, 0); + for (i_t old_row = 0; old_row < n_model; ++old_row) { + const i_t p_beg = user_problem.Q_offsets[old_row]; + const i_t p_end = user_problem.Q_offsets[old_row + 1]; + cuopt_expects( + p_beg >= 0 && p_beg <= p_end && p_end 
<= static_cast(user_problem.Q_values.size()), + error_type_t::ValidationError, + "Quadratic objective CSR offsets are invalid at row %d", + static_cast(old_row)); + const i_t new_row = old_to_new[old_row]; + q_offsets[new_row + 1] = p_end - p_beg; + } + for (i_t row = 0; row < n_prob; ++row) { + q_offsets[row + 1] += q_offsets[row]; + } + + std::vector q_indices(user_problem.Q_values.size()); + std::vector q_values(user_problem.Q_values.size()); + std::vector q_write = q_offsets; + for (i_t old_row = 0; old_row < n_model; ++old_row) { + const i_t new_row = old_to_new[old_row]; + for (i_t p = user_problem.Q_offsets[old_row]; p < user_problem.Q_offsets[old_row + 1]; ++p) { + const i_t old_col = user_problem.Q_indices[p]; + cuopt_expects(old_col >= 0 && old_col < n_model, + error_type_t::ValidationError, + "Quadratic objective column index %d is outside [0, %d)", + static_cast(old_col), + static_cast(n_model)); + const i_t dst = q_write[new_row]++; + q_indices[dst] = old_to_new[old_col]; + q_values[dst] = user_problem.Q_values[p]; + } + } + + user_problem.Q_offsets = std::move(q_offsets); + user_problem.Q_indices = std::move(q_indices); + user_problem.Q_values = std::move(q_values); + } + + user_problem.cone_var_start = cone_var_start; + user_problem.second_order_cone_dims = std::move(cone_dims); + user_problem.num_rows = csr_A.m; + user_problem.num_cols = n_prob; + + user_problem.original_num_cols = static_cast(n); + user_problem.original_col_to_expanded_col.resize(n); + for (i_t old_j = 0; old_j < static_cast(n); ++old_j) { + user_problem.original_col_to_expanded_col[old_j] = old_to_new[old_j]; + } +} + +/** Map barrier primal/reduced-cost vectors from expanded SOC layout back to original model columns. 
+ */ +template +void project_barrier_solution_to_model_variables( + const dual_simplex::user_problem_t& user_problem, + dual_simplex::lp_solution_t& solution) +{ + const i_t n_original = user_problem.original_num_cols; + if (n_original <= 0) { return; } + if (static_cast(user_problem.original_col_to_expanded_col.size()) != n_original) { return; } + + std::vector model_x(n_original); + std::vector model_z(n_original); + for (i_t j = 0; j < n_original; ++j) { + const i_t expanded_j = user_problem.original_col_to_expanded_col[j]; + model_x[j] = solution.x[expanded_j]; + model_z[j] = solution.z[expanded_j]; + } + const i_t m = static_cast(solution.y.size()); + solution.resize(m, n_original); + solution.x = std::move(model_x); + solution.z = std::move(model_z); +} + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index 7d7eedc7c1..d438fd133b 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -16,10 +16,16 @@ #include #include #include -#include namespace cuopt::linear_programming::dual_simplex { +template +/** Number of leading linear columns; SOCP cone variables occupy [linear_cols, num_cols). */ +static i_t linear_variable_count(const lp_problem_t& problem) +{ + return problem.second_order_cone_dims.empty() ? 
problem.num_cols : problem.cone_var_start; +} + template i_t remove_empty_cols(lp_problem_t& problem, i_t& num_empty_cols, @@ -40,9 +46,10 @@ i_t remove_empty_cols(lp_problem_t& problem, // Check to see if a variable participates in a quadratic objective std::vector has_quadratic_term(problem.num_cols, false); + i_t linear_cols = linear_variable_count(problem); if (problem.Q.n > 0) { - for (i_t j = 0; j < problem.num_cols; ++j) { + for (i_t j = 0; j < linear_cols; ++j) { const i_t row_start = problem.Q.row_start[j]; const i_t row_end = problem.Q.row_start[j + 1]; if (row_end - row_start == 0) { continue; } @@ -55,12 +62,13 @@ i_t remove_empty_cols(lp_problem_t& problem, i_t new_cols = 0; for (i_t j = 0; j < problem.num_cols; ++j) { bool remove_var = false; - if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { - if (problem.objective[j] >= 0 && problem.lower[j] > -inf && !has_quadratic_term[j]) { + if (j < linear_cols && (problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { + bool non_removable = has_quadratic_term[j]; + if (problem.objective[j] >= 0 && problem.lower[j] > -inf && !non_removable) { presolve_info.removed_values.push_back(problem.lower[j]); problem.obj_constant += problem.objective[j] * problem.lower[j]; remove_var = true; - } else if (problem.objective[j] <= 0 && problem.upper[j] < inf && !has_quadratic_term[j]) { + } else if (problem.objective[j] <= 0 && problem.upper[j] < inf && !non_removable) { presolve_info.removed_values.push_back(problem.upper[j]); problem.obj_constant += problem.objective[j] * problem.upper[j]; remove_var = true; @@ -123,6 +131,12 @@ i_t remove_empty_cols(lp_problem_t& problem, problem.Q.check_matrix("After removing empty columns"); } + if (!problem.second_order_cone_dims.empty()) { + i_t new_cone_start = col_old_to_new[problem.cone_var_start]; + assert(new_cone_start != -1); + problem.cone_var_start = new_cone_start; + } + problem.objective = objective; problem.lower = lower; problem.upper = upper; @@ 
-272,6 +286,113 @@ i_t convert_less_than_to_equal(const user_problem_t& user_problem, // We must convert rows in the form: a_i^T x <= beta // into: a_i^T x + s_i = beta, s_i >= 0 + if (!problem.second_order_cone_dims.empty()) { + const i_t old_num_cols = problem.num_cols; + const i_t linear_cols = linear_variable_count(problem); + const i_t num_slacks = less_rows; + const i_t num_cols = old_num_cols + num_slacks; + const i_t old_nnz = problem.A.col_start[old_num_cols]; + const i_t nnz = old_nnz + num_slacks; + const i_t new_cone_start = linear_cols + num_slacks; + + auto old_A = problem.A; + csc_matrix_t expanded_A(problem.A.m, num_cols, nnz); + + std::vector objective(num_cols, 0.0); + std::vector lower(num_cols, 0.0); + std::vector upper(num_cols, INFINITY); + std::vector old_to_new(old_num_cols, -1); + + for (i_t j = 0; j < linear_cols; ++j) { + old_to_new[j] = j; + objective[j] = problem.objective[j]; + lower[j] = problem.lower[j]; + upper[j] = problem.upper[j]; + } + for (i_t j = linear_cols; j < old_num_cols; ++j) { + old_to_new[j] = j + num_slacks; + objective[old_to_new[j]] = problem.objective[j]; + lower[old_to_new[j]] = problem.lower[j]; + upper[old_to_new[j]] = problem.upper[j]; + } + + i_t nz = 0; + for (i_t j = 0; j < linear_cols; ++j) { + expanded_A.col_start[j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] = old_A.x[p]; + ++nz; + } + } + + i_t slack_col = linear_cols; + for (i_t i = 0; i < problem.num_rows; i++) { + if (row_sense[i] == 'L') { + expanded_A.col_start[slack_col] = nz; + expanded_A.i[nz] = i; + expanded_A.x[nz] = 1.0; + new_slacks.push_back(slack_col); + row_sense[i] = 'E'; + ++slack_col; + ++nz; + --less_rows; + } + } + + for (i_t j = linear_cols; j < old_num_cols; ++j) { + i_t new_j = old_to_new[j]; + expanded_A.col_start[new_j] = nz; + for (i_t p = old_A.col_start[j]; p < old_A.col_start[j + 1]; ++p) { + expanded_A.i[nz] = old_A.i[p]; + expanded_A.x[nz] 
= old_A.x[p]; + ++nz; + } + } + expanded_A.col_start[num_cols] = nz; + assert(less_rows == 0); + assert(slack_col == new_cone_start); + assert(nz == nnz); + + if (problem.Q.n > 0) { + const auto old_Q = problem.Q; + const i_t q_nnz = old_Q.row_start[old_num_cols]; + + problem.Q.row_start.assign(num_cols + 1, 0); + for (i_t row = 0; row < old_num_cols; ++row) { + i_t new_row = old_to_new[row]; + problem.Q.row_start[new_row + 1] = old_Q.row_start[row + 1] - old_Q.row_start[row]; + } + for (i_t row = 0; row < num_cols; ++row) { + problem.Q.row_start[row + 1] += problem.Q.row_start[row]; + } + + problem.Q.j.resize(q_nnz); + problem.Q.x.resize(q_nnz); + auto row_starts = problem.Q.row_start; + for (i_t row = 0; row < old_num_cols; ++row) { + i_t new_row = old_to_new[row]; + for (i_t p = old_Q.row_start[row]; p < old_Q.row_start[row + 1]; ++p) { + problem.Q.j[row_starts[new_row]] = old_to_new[old_Q.j[p]]; + problem.Q.x[row_starts[new_row]] = old_Q.x[p]; + ++row_starts[new_row]; + } + } + problem.Q.m = num_cols; + problem.Q.n = num_cols; + problem.Q.nz_max = q_nnz; + } + + problem.A = expanded_A; + problem.A.n = num_cols; + problem.objective = objective; + problem.lower = lower; + problem.upper = upper; + problem.num_cols = num_cols; + problem.cone_var_start = new_cone_start; + return 0; + } + i_t num_cols = problem.num_cols + less_rows; i_t nnz = problem.A.col_start[problem.num_cols] + less_rows; problem.A.col_start.resize(num_cols + 1); @@ -571,17 +692,19 @@ void convert_user_problem(const user_problem_t& user_problem, } // Copy info from user_problem to problem - problem.num_rows = user_problem.num_rows; - problem.num_cols = user_problem.num_cols; - problem.A = user_problem.A; - problem.objective = user_problem.objective; - problem.obj_scale = user_problem.obj_scale; - problem.obj_constant = user_problem.obj_constant; - problem.objective_is_integral = user_problem.objective_is_integral; - problem.objective_step = user_problem.objective_step; - problem.rhs = 
user_problem.rhs; - problem.lower = user_problem.lower; - problem.upper = user_problem.upper; + problem.num_rows = user_problem.num_rows; + problem.num_cols = user_problem.num_cols; + problem.A = user_problem.A; + problem.objective = user_problem.objective; + problem.obj_scale = user_problem.obj_scale; + problem.obj_constant = user_problem.obj_constant; + problem.objective_is_integral = user_problem.objective_is_integral; + problem.objective_step = user_problem.objective_step; + problem.rhs = user_problem.rhs; + problem.lower = user_problem.lower; + problem.upper = user_problem.upper; + problem.cone_var_start = user_problem.cone_var_start; + problem.second_order_cone_dims = user_problem.second_order_cone_dims; // Make a copy of row_sense so we can modify it std::vector row_sense = user_problem.row_sense; @@ -638,6 +761,7 @@ void convert_user_problem(const user_problem_t& user_problem, settings.log.debug( "equality rows %d less rows %d columns %d\n", equal_rows, less_rows, problem.num_cols); if (settings.barrier && settings.dualize != 0 && user_problem.Q_values.size() == 0 && + problem.second_order_cone_dims.empty() && (settings.dualize == 1 || (settings.dualize == -1 && less_rows > 1.2 * problem.num_cols && equal_rows < 2e4))) { settings.log.debug("Dualizing in presolve\n"); @@ -821,11 +945,15 @@ i_t presolve(const lp_problem_t& original, lp_problem_t& problem, presolve_info_t& presolve_info) { - problem = original; + problem = original; + const i_t linear_cols = linear_variable_count(problem); + const bool has_cones = !problem.second_order_cone_dims.empty(); std::vector row_sense(problem.num_rows, '='); - // Check for free variables + + // Check for free variables (linear block only; cone columns are handled by the barrier SOC + // layout) i_t free_variables = 0; - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } } @@ -836,7 +964,7 @@ i_t 
presolve(const lp_problem_t& original, std::vector row_marked(problem.num_rows, 0); current_free_variables.reserve(problem.num_cols); constraints_to_check.reserve(problem.num_rows); - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] == -inf && problem.upper[j] == inf) { current_free_variables.push_back(j); const i_t col_start = problem.A.col_start[j]; @@ -864,6 +992,25 @@ i_t presolve(const lp_problem_t& original, csr_matrix_t Arow(0, 0, 0); problem.A.to_compressed_row(Arow); + // Keep only rows safe for bound inference: no cone columns + if (has_cones) { + std::vector safe_constraints; + safe_constraints.reserve(constraints_to_check.size()); + for (i_t i : constraints_to_check) { + bool touches_cone = false; + for (i_t p = Arow.row_start[i]; p < Arow.row_start[i + 1]; ++p) { + const i_t j = Arow.j[p]; + if (j >= linear_cols) { + touches_cone = true; + continue; + } + } + if (touches_cone) { continue; } + safe_constraints.push_back(i); + } + constraints_to_check.swap(safe_constraints); + } + // The constraints are in the form: // sum_j a_j x_j = beta for (i_t i : constraints_to_check) { @@ -876,7 +1023,8 @@ i_t presolve(const lp_problem_t& original, i_t last_free_i = -1; f_t last_free_coeff_i = 0.0; for (i_t p = row_start; p < row_end; p++) { - const i_t j = Arow.j[p]; + const i_t j = Arow.j[p]; + if (j >= linear_cols) { continue; } const f_t aij = Arow.x[p]; const f_t lower_j = problem.lower[j]; const f_t upper_j = problem.upper[j]; @@ -928,8 +1076,9 @@ i_t presolve(const lp_problem_t& original, // And we can derive two bounds from this: // x_j >= 1/a_ij * (rhs - lower_activity_i) // x_j <= 1/a_ij * (rhs - upper_activity_i) - const i_t j = last_free_i; - const f_t a_ij = last_free_coeff_i; + const i_t j = last_free_i; + const f_t a_ij = last_free_coeff_i; + if (a_ij == 0) { continue; } const f_t max_bound = 1e10; bool bounded = false; if (a_ij > 0) { @@ -1012,16 +1161,16 @@ i_t presolve(const lp_problem_t& 
original, if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } } if (removed_free_variables != 0) { - settings.log.printf("Bounded %d free variable row(s) in presolve\n", + settings.log.printf("Bounded %d free variables in presolve\n", static_cast(removed_free_variables)); } } // The original problem may have a variable without a lower bound // but a finite upper bound - // -inf < x_j <= u_j + // -inf < x_j <= u_j (linear variables only) i_t no_lower_bound = 0; - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] == -inf && problem.upper[j] < inf) { no_lower_bound++; } } @@ -1032,7 +1181,7 @@ i_t presolve(const lp_problem_t& original, // Handle -inf < x_j <= u_j by substituting x'_j = -x_j, giving -u_j <= x'_j < inf if (settings.barrier_presolve && no_lower_bound > 0) { presolve_info.negated_variables.reserve(no_lower_bound); - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] == -inf && problem.upper[j] < inf) { presolve_info.negated_variables.push_back(j); @@ -1070,7 +1219,7 @@ i_t presolve(const lp_problem_t& original, // The original problem may have nonzero lower bounds // 0 != l_j <= x_j <= u_j i_t nonzero_lower_bounds = 0; - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; } } if (settings.barrier_presolve && nonzero_lower_bounds > 0) { @@ -1093,7 +1242,7 @@ i_t presolve(const lp_problem_t& original, // so we get the constant term c_j * l_j std::vector lower_bounds_removed(problem.num_cols, false); - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { lower_bounds_removed[j] = true; presolve_info.removed_lower_bounds[j] = problem.lower[j]; @@ -1102,7 +1251,7 @@ i_t presolve(const lp_problem_t& original, auto 
old_objective = problem.objective; if (problem.Q.n > 0) { - for (i_t row = 0; row < problem.num_cols; row++) { + for (i_t row = 0; row < linear_cols; row++) { i_t row_start = problem.Q.row_start[row]; i_t row_end = problem.Q.row_start[row + 1]; for (i_t p = row_start; p < row_end; p++) { @@ -1123,7 +1272,7 @@ i_t presolve(const lp_problem_t& original, } std::vector kahan_compensation(problem.num_rows, 0.0); - for (i_t j = 0; j < problem.num_cols; j++) { + for (i_t j = 0; j < linear_cols; j++) { if (lower_bounds_removed[j]) { i_t col_start = problem.A.col_start[j]; i_t col_end = problem.A.col_start[j + 1]; @@ -1161,7 +1310,7 @@ i_t presolve(const lp_problem_t& original, // Check for empty cols i_t num_empty_cols = 0; { - for (i_t j = 0; j < problem.num_cols; ++j) { + for (i_t j = 0; j < linear_cols; ++j) { if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { num_empty_cols++; } } } @@ -1170,19 +1319,32 @@ i_t presolve(const lp_problem_t& original, remove_empty_cols(problem, num_empty_cols, presolve_info); } + // Check for free variables (exclude cone variables — they are naturally unbounded) + free_variables = 0; + for (i_t j = 0; j < linear_cols; j++) { + if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } + } problem.Q.check_matrix("Before free variable expansion"); - // For QPs, keep free variables as-is rather than - // splitting x = v - w. The barrier solver handles them natively with a - // static regularizer on the diagonal instead of z/x complementarity terms. - if (settings.barrier_presolve && free_variables > 0 && problem.Q.n > 0) { - presolve_info.free_variable_indices.clear(); - for (i_t j = 0; j < problem.num_cols; j++) { + // Free linear variables. We handle them directly in QP/SOCP or split them in LP. 
+ const bool direct_free_linear = + settings.barrier_presolve && free_variables > 0 && (problem.Q.n > 0 || has_cones); + if (direct_free_linear) { + presolve_info.free_variable_pairs.clear(); + presolve_info.direct_free_variables.clear(); + // Only free linear decision variables need to be handled; cone/stack columns + // are unbounded by construction and must not be counted here. + i_t direct_free_count = 0; + for (i_t j = 0; j < linear_cols; j++) { if (problem.lower[j] == -inf && problem.upper[j] == inf) { - presolve_info.free_variable_indices.push_back(j); + presolve_info.direct_free_variables.push_back(j); + direct_free_count++; } } - } else if (settings.barrier_presolve && free_variables > 0) { + settings.log.printf("Handling %d free variables directly in augmented system\n", + direct_free_count); + } else if (settings.barrier_presolve && !has_cones && free_variables > 0) { + // For pure LP problems (Q is empty and there are no cones in this branch) // We have a variable x_j: with -inf < x_j < inf // we create new variables v and w with 0 <= v, w and x_j = v - w // Constraints @@ -1195,7 +1357,6 @@ i_t presolve(const lp_problem_t& original, // becomes // sum_{k != j} c_k x_k + c_j v - c_j w - std::vector pair_index(problem.num_cols, -1); i_t num_cols = problem.num_cols + free_variables; i_t nnz = problem.A.col_start[problem.num_cols]; for (i_t j = 0; j < problem.num_cols; j++) { @@ -1229,112 +1390,21 @@ i_t presolve(const lp_problem_t& original, problem.objective[col] = -problem.objective[j]; presolve_info.free_variable_pairs[pair_count++] = j; presolve_info.free_variable_pairs[pair_count++] = col; - pair_index[j] = col; problem.A.col_start[++col] = q; problem.lower[j] = 0.0; } } - if (problem.Q.n > 0) { - std::vector row_counts(num_cols, 0); - i_t nz_count = problem.Q.row_start[problem.num_cols]; - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - row_counts[row] = q_end - 
q_start; - for (i_t qj = q_start; qj < q_end; qj++) { - i_t col = problem.Q.j[qj]; - if (pair_index[row] != -1 && pair_index[col] != -1) { - assert(pair_index[row] >= problem.num_cols); - assert(pair_index[col] >= problem.num_cols); - row_counts[row]++; - row_counts[pair_index[row]] += 2; - nz_count += 3; - } else if (pair_index[col] != -1) { - assert(pair_index[col] >= problem.num_cols); - row_counts[row]++; - nz_count++; - } else if (pair_index[row] != -1) { - assert(pair_index[row] >= problem.num_cols); - row_counts[pair_index[row]]++; - nz_count++; - } - } - } - - std::vector Q_row_start(num_cols + 1); - Q_row_start[0] = 0; - for (i_t row = 0; row < num_cols; row++) { - Q_row_start[row + 1] = Q_row_start[row] + row_counts[row]; - } - std::vector Q_j(nz_count); - std::vector Q_x(nz_count); - auto row_starts = Q_row_start; - // First copy the original Q ma - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - i_t q_nz = Q_row_start[row]; - for (i_t qj = q_start; qj < q_end; qj++) { - i_t col = problem.Q.j[qj]; - f_t qij = problem.Q.x[qj]; - Q_j[q_nz] = col; - Q_x[q_nz] = qij; - q_nz++; - } - row_starts[row] = q_nz; - } - - // Expand the Q matrix for the free variables - for (i_t row = 0; row < problem.Q.n; row++) { - i_t q_start = problem.Q.row_start[row]; - i_t q_end = problem.Q.row_start[row + 1]; - for (i_t qj = q_start; qj < q_end; qj++) { - i_t col = problem.Q.j[qj]; - f_t qij = problem.Q.x[qj]; - if (pair_index[row] != -1 && pair_index[col] != -1) { - Q_j[row_starts[row]] = pair_index[col]; - Q_x[row_starts[row]] = -qij; - row_starts[row]++; - - Q_j[row_starts[pair_index[row]]] = col; - Q_x[row_starts[pair_index[row]]] = -qij; - row_starts[pair_index[row]]++; - - Q_j[row_starts[pair_index[row]]] = pair_index[col]; - Q_x[row_starts[pair_index[row]]] = qij; - row_starts[pair_index[row]]++; - } else if (pair_index[col] != -1) { - Q_j[row_starts[row]] = pair_index[col]; - 
Q_x[row_starts[row]] = -qij; - row_starts[row]++; - } else if (pair_index[row] != -1) { - Q_j[row_starts[pair_index[row]]] = col; - Q_x[row_starts[pair_index[row]]] = -qij; - row_starts[pair_index[row]]++; - } - } - } - - problem.Q.m = problem.Q.n = num_cols; - problem.Q.nz_max = Q_row_start[num_cols]; - problem.Q.row_start = Q_row_start; - problem.Q.j = Q_j; - problem.Q.x = Q_x; - problem.Q.check_matrix("After free variable expansion"); - } - - // assert(problem.A.p[num_cols] == nnz); problem.A.n = num_cols; problem.num_cols = num_cols; } - if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0) { + if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0 && !has_cones) { folding(problem, settings, presolve_info); } // Check for dependent rows - bool check_dependent_rows = false; // settings.barrier; + bool check_dependent_rows = false; if (check_dependent_rows) { std::vector dependent_rows; constexpr i_t kOk = -1; @@ -1407,7 +1477,7 @@ void crush_primal_solution(const user_problem_t& user_problem, // including previously added slacks, are reset before writing new values. solution.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - solution[j] = user_solution[j]; + solution[user_col_to_problem_col(user_problem, problem, j)] = user_solution[j]; } std::vector primal_residual(problem.num_rows); @@ -1447,7 +1517,7 @@ void crush_primal_solution_with_slack(const user_problem_t& user_probl // Re-crush can be called with a reused output vector; clear stale entries first. 
solution.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - solution[j] = user_solution[j]; + solution[user_col_to_problem_col(user_problem, problem, j)] = user_solution[j]; } std::vector primal_residual(problem.num_rows); @@ -1494,9 +1564,9 @@ f_t crush_dual_solution(const user_problem_t& user_problem, for (i_t i = 0; i < user_problem.num_rows; i++) { y[i] = user_y[i]; } - z.resize(problem.num_cols); + z.assign(problem.num_cols, 0.0); for (i_t j = 0; j < user_problem.num_cols; j++) { - z[j] = user_z[j]; + z[user_col_to_problem_col(user_problem, problem, j)] = user_z[j]; } std::vector is_range_row(problem.num_rows, false); @@ -1563,6 +1633,17 @@ f_t crush_dual_solution(const user_problem_t& user_problem, return dual_res_inf; } +template +static i_t user_col_to_problem_col(const user_problem_t& user_problem, + const lp_problem_t& problem, + i_t user_col) +{ + if (user_problem.second_order_cone_dims.empty()) { return user_col; } + if (problem.cone_var_start <= user_problem.cone_var_start) { return user_col; } + if (user_col < user_problem.cone_var_start) { return user_col; } + return problem.cone_var_start + (user_col - user_problem.cone_var_start); +} + template void uncrush_primal_solution(const user_problem_t& user_problem, const lp_problem_t& problem, @@ -1572,9 +1653,9 @@ void uncrush_primal_solution(const user_problem_t& user_problem, user_solution.resize(user_problem.num_cols); assert(problem.num_cols >= user_problem.num_cols); assert(solution.size() >= user_problem.num_cols); - std::copy(solution.begin(), - solution.begin() + std::min((i_t)solution.size(), user_problem.num_cols), - user_solution.data()); + for (i_t j = 0; j < user_problem.num_cols; ++j) { + user_solution[j] = solution[user_col_to_problem_col(user_problem, problem, j)]; + } } template @@ -1770,10 +1851,13 @@ void uncrush_solution(const presolve_info_t& presolve_info, // Then you can show that A^T y_bar + z_bar = c + Qx and // z_bar_{j_f} = 0. 
if (!presolve_info.bounded_free_variables.empty()) { - settings.log.printf("Post-solve: Correcting duals for %d bounded free variables\n", - static_cast(presolve_info.bounded_free_variables.size())); + const i_t num_bfv = static_cast(presolve_info.bounded_free_variables.size()); + settings.log.printf("Post-solve: Correcting duals for %d bounded free variables\n", num_bfv); const csc_matrix_t& A = original_problem.A; + // Traverse in reverse order, to ensure that all z_j = 0 after the correction + csr_matrix_t Arow(0, 0, 0); + A.to_compressed_row(Arow); for (auto it = presolve_info.bounded_free_variables.rbegin(); it != presolve_info.bounded_free_variables.rend(); ++it) { @@ -1782,15 +1866,10 @@ void uncrush_solution(const presolve_info_t& presolve_info, if (w_j == 0.0) { continue; } const f_t du = w_j / bfv.coefficient; input_y[bfv.constraint] += du; - for (i_t j = 0; j < A.n; j++) { - const i_t col_start = A.col_start[j]; - const i_t col_end = A.col_start[j + 1]; - for (i_t p = col_start; p < col_end; p++) { - if (A.i[p] == bfv.constraint) { - input_z[j] -= A.x[p] * du; - break; - } - } + const i_t row_start = Arow.row_start[bfv.constraint]; + const i_t row_end = Arow.row_start[bfv.constraint + 1]; + for (i_t p = row_start; p < row_end; ++p) { + input_z[Arow.j[p]] -= Arow.x[p] * du; } } } diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp index 3bb9135fae..c6587c9d79 100644 --- a/cpp/src/dual_simplex/presolve.hpp +++ b/cpp/src/dual_simplex/presolve.hpp @@ -50,6 +50,8 @@ struct lp_problem_t { f_t obj_scale; // 1.0 for min, -1.0 for max bool objective_is_integral{false}; objective_step_t objective_step; + i_t cone_var_start{0}; + std::vector second_order_cone_dims; void write_mps(const std::string& path) const { @@ -163,10 +165,13 @@ struct presolve_info_t { // Variables that were negated to handle -inf < x_j <= u_j std::vector negated_variables; + + // Free variable indices that the barrier solver handles directly in the augmented 
system + // (not split into v - w). Used for QP/SOCP. + std::vector direct_free_variables; + // Originally-free variables that received implied bounds, with the constraint used std::vector> bounded_free_variables; - // Free variable indices for QP augmented system (not split, handled natively) - std::vector free_variable_indices; }; template diff --git a/cpp/src/dual_simplex/scaling.cpp b/cpp/src/dual_simplex/scaling.cpp index 1531c91486..92b4d3377d 100644 --- a/cpp/src/dual_simplex/scaling.cpp +++ b/cpp/src/dual_simplex/scaling.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -13,16 +13,180 @@ namespace cuopt::linear_programming::dual_simplex { template -i_t column_scaling(const lp_problem_t& unscaled, - const simplex_solver_settings_t& settings, - lp_problem_t& scaled, - std::vector& column_scaling) +i_t scaling(const lp_problem_t& unscaled, + const simplex_solver_settings_t& settings, + lp_problem_t& scaled, + std::vector& column_scaling, + std::vector& row_scaling) { scaled = unscaled; i_t m = scaled.num_rows; i_t n = scaled.num_cols; - if (!settings.scale_columns || unscaled.Q.n > 0) { + row_scaling.assign(m, 1.0); + + // ========================================================================= + // Ruiz equilibration for SOCP (and QP) problems + // ========================================================================= + // For SOCP problems, apply Ruiz equilibration: alternating row and column + // infinity-norm scaling to bring the constraint matrix close to equilibrium. + // This dramatically improves the conditioning of the augmented KKT system. + // Applied only when the constraint matrix has a large row-norm imbalance. 
+ if (!unscaled.second_order_cone_dims.empty() || unscaled.Q.n > 0) { + // col_scale and row_scale accumulate reciprocal scale factors during Ruiz iterations. + std::vector col_scale(n, 1.0); + + // Decide whether Ruiz scaling is needed by checking row-norm imbalance. + // If max_row_norm / min_row_norm is small, the matrix is already well-conditioned + // and scaling can hurt (e.g. by amplifying tiny noise coefficients). + csr_matrix_t Arow_check(0, 0, 0); + scaled.A.to_compressed_row(Arow_check); + f_t max_row_norm = 0; + f_t min_row_norm = std::numeric_limits::max(); + for (i_t i = 0; i < m; ++i) { + f_t row_norm = 0; + for (i_t p = Arow_check.row_start[i]; p < Arow_check.row_start[i + 1]; ++p) { + f_t a = std::abs(Arow_check.x[p]); + if (a > row_norm) row_norm = a; + } + if (row_norm > 0) { + max_row_norm = std::max(max_row_norm, row_norm); + min_row_norm = std::min(min_row_norm, row_norm); + } + } + f_t row_norm_ratio = (min_row_norm > 0) ? max_row_norm / min_row_norm : 1.0; + + if (row_norm_ratio < 100.0) { + settings.log.printf("Skipping Ruiz equilibration (row norm ratio %.1f < 100)\n", + row_norm_ratio); + column_scaling.assign(n, 1.0); + return 0; + } + + // Apply Ruiz equilibration + csr_matrix_t Arow(0, 0, 0); + scaled.A.to_compressed_row(Arow); + + constexpr i_t max_ruiz_iterations = 10; + for (i_t iter = 0; iter < max_ruiz_iterations; ++iter) { + f_t max_deviation = 0.0; + + // --- Row scaling: scale each row by 1/sqrt(max|a_ij|) --- + std::vector r(m); + for (i_t i = 0; i < m; ++i) { + f_t rm = 0.0; + for (i_t p = Arow.row_start[i]; p < Arow.row_start[i + 1]; ++p) { + f_t a = std::abs(Arow.x[p]); + if (a > rm) rm = a; + } + r[i] = rm > 0 ? 
1.0 / std::sqrt(rm) : 1.0; + max_deviation = std::max(max_deviation, std::abs(rm - 1.0)); + } + for (i_t j = 0; j < n; ++j) { + for (i_t p = scaled.A.col_start[j]; p < scaled.A.col_start[j + 1]; ++p) { + scaled.A.x[p] *= r[scaled.A.i[p]]; + } + } + for (i_t i = 0; i < m; ++i) { + for (i_t p = Arow.row_start[i]; p < Arow.row_start[i + 1]; ++p) { + Arow.x[p] *= r[i]; + } + scaled.rhs[i] *= r[i]; + row_scaling[i] *= r[i]; + } + + // --- Column scaling: scale each column by 1/sqrt(max|a_ij|) --- + // For cone variables, use a uniform scale per cone block to preserve SOC structure. + std::vector c(n); + const i_t cone_start = unscaled.second_order_cone_dims.empty() ? n : unscaled.cone_var_start; + + // Linear columns: scale independently + for (i_t j = 0; j < cone_start; ++j) { + f_t cm = 0.0; + for (i_t p = scaled.A.col_start[j]; p < scaled.A.col_start[j + 1]; ++p) { + f_t a = std::abs(scaled.A.x[p]); + if (a > cm) cm = a; + } + c[j] = cm > 0 ? 1.0 / std::sqrt(cm) : 1.0; + max_deviation = std::max(max_deviation, std::abs(cm - 1.0)); + } + + // Cone columns: uniform scale per cone block + i_t cone_off = cone_start; + for (i_t k = 0; k < static_cast(unscaled.second_order_cone_dims.size()); ++k) { + i_t q_k = unscaled.second_order_cone_dims[k]; + // Find max column inf-norm across all columns in this cone + f_t cone_max = 0.0; + for (i_t j = cone_off; j < cone_off + q_k; ++j) { + for (i_t p = scaled.A.col_start[j]; p < scaled.A.col_start[j + 1]; ++p) { + f_t a = std::abs(scaled.A.x[p]); + if (a > cone_max) cone_max = a; + } + } + f_t cone_scale = cone_max > 0 ? 
1.0 / std::sqrt(cone_max) : 1.0; + max_deviation = std::max(max_deviation, std::abs(cone_max - 1.0)); + for (i_t j = cone_off; j < cone_off + q_k; ++j) { + c[j] = cone_scale; + } + cone_off += q_k; + } + for (i_t j = 0; j < n; ++j) { + for (i_t p = scaled.A.col_start[j]; p < scaled.A.col_start[j + 1]; ++p) { + scaled.A.x[p] *= c[j]; + } + } + for (i_t i = 0; i < m; ++i) { + for (i_t p = Arow.row_start[i]; p < Arow.row_start[i + 1]; ++p) { + Arow.x[p] *= c[Arow.j[p]]; + } + } + for (i_t j = 0; j < n; ++j) { + scaled.objective[j] *= c[j]; + col_scale[j] *= c[j]; + } + // Bounds use +/-inf for unbounded sides (see types.hpp). Use +/-1e20 as a practical + // sentinel: we do not expect finite bounds beyond this magnitude, and skipping scale + // on |bound| >= 1e20 avoids overflow when dividing very large limits by small c[j]. + for (i_t j = 0; j < n; ++j) { + if (scaled.lower[j] > -1e20) scaled.lower[j] /= c[j]; + if (scaled.upper[j] < 1e20) scaled.upper[j] /= c[j]; + } + if (scaled.Q.n > 0) { + for (i_t row = 0; row < scaled.Q.m; ++row) { + for (i_t p = scaled.Q.row_start[row]; p < scaled.Q.row_start[row + 1]; ++p) { + i_t col = scaled.Q.j[p]; + scaled.Q.x[p] *= c[row] * c[col]; + } + } + } + if (max_deviation < 0.1) break; + } + + // Ruiz col_scale/row_scaling accumulate reciprocals (c[j] = 1/sqrt(norm)). + // Invert to match the output convention: C(j,j) = 1/column_scaling[j], + // R(i,i) = 1/row_scaling[i]. 
+ column_scaling.resize(n); + for (i_t j = 0; j < n; ++j) { + column_scaling[j] = f_t(1) / col_scale[j]; + } + for (i_t i = 0; i < m; ++i) { + row_scaling[i] = f_t(1) / row_scaling[i]; + } + + f_t a_min = std::numeric_limits::max(); + f_t a_max = 0; + for (i_t p = 0; p < scaled.A.col_start[n]; ++p) { + f_t a = std::abs(scaled.A.x[p]); + if (a > 0) { + a_min = std::min(a_min, a); + a_max = std::max(a_max, a); + } + } + settings.log.printf("Ruiz equilibration: coefficient range [%e, %e]\n", a_min, a_max); + return 0; + } + + if (!settings.scale_columns) { settings.log.printf("Skipping column scaling\n"); column_scaling.resize(n, 1.0); return 0; @@ -79,9 +243,12 @@ i_t column_scaling(const lp_problem_t& unscaled, template void unscale_solution(const std::vector& column_scaling, + const std::vector& row_scaling, const std::vector& scaled_x, + const std::vector& scaled_y, const std::vector& scaled_z, std::vector& unscaled_x, + std::vector& unscaled_y, std::vector& unscaled_z) { const i_t n = scaled_x.size(); @@ -91,19 +258,30 @@ void unscale_solution(const std::vector& column_scaling, unscaled_x[j] = scaled_x[j] / column_scaling[j]; unscaled_z[j] = scaled_z[j] * column_scaling[j]; } + + const i_t m = scaled_y.size(); + unscaled_y.resize(m); + // R(i,i) = 1/row_scaling[i], so y_orig = y_scaled / row_scaling + for (i_t i = 0; i < m; ++i) { + unscaled_y[i] = scaled_y[i] / row_scaling[i]; + } } #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template int column_scaling(const lp_problem_t& unscaled, - const simplex_solver_settings_t& settings, - lp_problem_t& scaled, - std::vector& column_scaling); +template int scaling(const lp_problem_t& unscaled, + const simplex_solver_settings_t& settings, + lp_problem_t& scaled, + std::vector& column_scaling, + std::vector& row_scaling); template void unscale_solution(const std::vector& column_scaling, + const std::vector& row_scaling, const std::vector& scaled_x, + const std::vector& scaled_y, const std::vector& scaled_z, std::vector& 
unscaled_x, + std::vector& unscaled_y, std::vector& unscaled_z); #endif diff --git a/cpp/src/dual_simplex/scaling.hpp b/cpp/src/dual_simplex/scaling.hpp index 120660c765..df0bf4d845 100644 --- a/cpp/src/dual_simplex/scaling.hpp +++ b/cpp/src/dual_simplex/scaling.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,16 +17,20 @@ namespace cuopt::linear_programming::dual_simplex { template -i_t column_scaling(const lp_problem_t& unscaled, - const simplex_solver_settings_t& settings, - lp_problem_t& scaled, - std::vector& column_scaling); +i_t scaling(const lp_problem_t& unscaled, + const simplex_solver_settings_t& settings, + lp_problem_t& scaled, + std::vector& column_scaling, + std::vector& row_scaling); template void unscale_solution(const std::vector& column_scaling, + const std::vector& row_scaling, const std::vector& scaled_x, + const std::vector& scaled_y, const std::vector& scaled_z, std::vector& unscaled_x, + std::vector& unscaled_y, std::vector& unscaled_z); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 9d8a62efc3..d81265358a 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -170,9 +170,10 @@ lp_status_t solve_linear_program_with_advanced_basis( presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); std::vector column_scales; + std::vector row_scales_simplex; { raft::common::nvtx::range scope_scaling("DualSimplex::scaling"); - column_scaling(presolved_lp, settings, lp, column_scales); + scaling(presolved_lp, settings, lp, column_scales, row_scales_simplex); } assert(presolved_lp.num_cols == lp.num_cols); lp_problem_t 
phase1_problem(original_lp.handle_ptr, 1, 1, 1); @@ -293,13 +294,21 @@ lp_status_t solve_linear_program_with_advanced_basis( } if (status == dual::status_t::OPTIMAL) { std::vector unscaled_x(lp.num_cols); + std::vector unscaled_y(lp.num_rows); std::vector unscaled_z(lp.num_cols); - unscale_solution(column_scales, solution.x, solution.z, unscaled_x, unscaled_z); + unscale_solution(column_scales, + row_scales_simplex, + solution.x, + solution.y, + solution.z, + unscaled_x, + unscaled_y, + unscaled_z); uncrush_solution(presolve_info, settings, original_lp, unscaled_x, - solution.y, + unscaled_y, unscaled_z, original_solution.x, original_solution.y, @@ -350,9 +359,12 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us // Convert the user problem to a linear program with only equality constraints std::vector new_slacks; simplex_solver_settings_t barrier_settings = settings; - barrier_settings.barrier_presolve = true; dualize_info_t dualize_info; convert_user_problem(user_problem, barrier_settings, original_lp, new_slacks, dualize_info); + if (!validate_barrier_cone_layout(original_lp, barrier_settings)) { + return lp_status_t::NUMERICAL_ISSUES; + } + lp_solution_t lp_solution(original_lp.num_rows, original_lp.num_cols); // Presolve the linear program @@ -369,7 +381,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); std::vector column_scales; - column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); + std::vector row_scales; + scaling(presolved_lp, barrier_settings, barrier_lp, column_scales, row_scales); // Solve using barrier lp_solution_t barrier_solution(barrier_lp.num_rows, barrier_lp.num_cols); @@ -394,9 +407,16 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us #endif // Unscale the solution std::vector unscaled_x(barrier_lp.num_cols); + std::vector unscaled_y(barrier_lp.num_rows); std::vector 
unscaled_z(barrier_lp.num_cols); - unscale_solution( - column_scales, barrier_solution.x, barrier_solution.z, unscaled_x, unscaled_z); + unscale_solution(column_scales, + row_scales, + barrier_solution.x, + barrier_solution.y, + barrier_solution.z, + unscaled_x, + unscaled_y, + unscaled_z); std::vector residual = presolved_lp.rhs; matrix_vector_multiply(presolved_lp.A, 1.0, unscaled_x, -1.0, residual); @@ -410,7 +430,7 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us unscaled_dual_residual[j] -= presolved_lp.objective[j]; } matrix_transpose_vector_multiply( - presolved_lp.A, 1.0, barrier_solution.y, 1.0, unscaled_dual_residual); + presolved_lp.A, 1.0, unscaled_y, 1.0, unscaled_dual_residual); f_t unscaled_dual_residual_norm = vector_norm_inf(unscaled_dual_residual); settings.log.printf( "Unscaled Dual infeasibility (abs/rel): %.2e/%.2e\n", @@ -423,7 +443,7 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us barrier_settings, original_lp, unscaled_x, - barrier_solution.y, + unscaled_y, unscaled_z, lp_solution.x, lp_solution.y, @@ -564,7 +584,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); uncrush_dual_solution( user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z); - solution.objective = barrier_solution.objective; + solution.objective = + barrier_solution.user_objective / user_problem.obj_scale - user_problem.obj_constant; solution.user_objective = barrier_solution.user_objective; solution.l2_primal_residual = barrier_solution.l2_primal_residual; solution.l2_dual_residual = barrier_solution.l2_dual_residual; diff --git a/cpp/src/dual_simplex/user_problem.hpp b/cpp/src/dual_simplex/user_problem.hpp index 45869738db..548ec9e449 100644 --- a/cpp/src/dual_simplex/user_problem.hpp +++ b/cpp/src/dual_simplex/user_problem.hpp @@ -63,6 +63,12 @@ struct user_problem_t { std::vector Q_offsets; 
std::vector Q_indices; std::vector Q_values; + i_t cone_var_start{0}; + std::vector second_order_cone_dims; + // Column count before QCMATRIX->SOC expansion. When > 0, the barrier solution is in the + // expanded layout (num_cols) and must be projected back via original_col_to_expanded_col. + i_t original_num_cols{0}; + std::vector original_col_to_expanded_col; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/vector_math.cuh b/cpp/src/dual_simplex/vector_math.cuh index abc7263858..32c75ea366 100644 --- a/cpp/src/dual_simplex/vector_math.cuh +++ b/cpp/src/dual_simplex/vector_math.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,6 +9,10 @@ #include +#include +#include +#include + #include #include @@ -28,6 +32,7 @@ struct norm_inf_max { template f_t device_custom_vector_norm_inf(InputIteratorT in, i_t size, rmm::cuda_stream_view stream_view) { + if (size == 0) { return 0; } // FIXME: Tmp storage stored in vector_math class. 
auto d_out = rmm::device_scalar(stream_view); rmm::device_uvector d_temp_storage(0, stream_view); @@ -62,6 +67,12 @@ f_t device_vector_norm_inf(const rmm::device_uvector& in, rmm::cuda_stream_ return device_custom_vector_norm_inf(in.data(), in.size(), stream_view); } +template +f_t device_vector_norm_inf(raft::device_span in, rmm::cuda_stream_view stream_view) +{ + return device_custom_vector_norm_inf(in.data(), in.size(), stream_view); +} + // TMP we should just have a CPU and GPU version to do the comparison // Should never have to norm inf a CPU vector if we are using the GPU template @@ -71,4 +82,12 @@ f_t vector_norm_inf(const std::vector& x, rmm::cuda_stream_view return device_vector_norm_inf(d_x, stream_view); } +template +f_t vector_norm_inf(raft::host_span x, rmm::cuda_stream_view stream_view) +{ + rmm::device_uvector d_x(x.size(), stream_view); + raft::copy(d_x.data(), x.data(), x.size(), stream_view); + return device_vector_norm_inf(d_x, stream_view); +} + } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/io/lp_parser.cpp b/cpp/src/io/lp_parser.cpp index 4f8c3f223f..c72b1c7c82 100644 --- a/cpp/src/io/lp_parser.cpp +++ b/cpp/src/io/lp_parser.cpp @@ -118,42 +118,41 @@ bool is_free_keyword(std::string_view lower) { return lower == "free"; } bool is_infinity_text(std::string_view lower) { return lower == "inf" || lower == "infinity"; } -// Builds the symmetric Q in CSR from LP-format raw upper-triangular triples. +// Builds the symmetric Q in COO from LP-format raw upper-triangular triples. // Each input triple (i, j, c) with i <= j represents `c * x_i * x_j` in the // LP source. The output Q satisfies x^T Q x = sum of those terms. // Diagonal (i == j): Q[i,i] = c (one entry). // Off-diagonal (i != j): Q[i,j] = Q[j,i] = c/2 (two entries; symmetric split). 
template -void build_symmetric_q_csr(const std::vector>& raw_triples, - i_t n_vars, - std::vector& out_values, - std::vector& out_indices, - std::vector& out_offsets) +void build_symmetric_q_coo(const coo_entries_t& raw_triples, + std::vector& out_row_indices, + std::vector& out_col_indices, + std::vector& out_values) { - std::vector>> row_data(n_vars); - for (const auto& [i, j, c] : raw_triples) { + out_row_indices.clear(); + out_col_indices.clear(); + out_values.clear(); + out_row_indices.reserve(raw_triples.size() * 2); + out_col_indices.reserve(raw_triples.size() * 2); + out_values.reserve(raw_triples.size() * 2); + + for (size_t p = 0; p < raw_triples.size(); p++) { + const i_t i = raw_triples.rows[p]; + const i_t j = raw_triples.cols[p]; + const f_t c = raw_triples.vals[p]; if (i == j) { - row_data[i].emplace_back(i, c); + out_row_indices.push_back(i); + out_col_indices.push_back(i); + out_values.push_back(c); } else { - row_data[i].emplace_back(j, c / f_t(2)); - row_data[j].emplace_back(i, c / f_t(2)); + out_row_indices.push_back(i); + out_col_indices.push_back(j); + out_values.push_back(c / 2); + out_row_indices.push_back(j); + out_col_indices.push_back(i); + out_values.push_back(c / 2); } } - for (auto& row : row_data) { - std::sort(row.begin(), row.end()); - } - out_offsets.clear(); - out_indices.clear(); - out_values.clear(); - out_offsets.reserve(static_cast(n_vars) + 1); - out_offsets.push_back(0); - for (i_t r = 0; r < n_vars; ++r) { - for (const auto& [col, val] : row_data[r]) { - out_values.push_back(val); - out_indices.push_back(col); - } - out_offsets.push_back(static_cast(out_values.size())); - } } // =========================================================================== @@ -264,7 +263,7 @@ class LpParseEngine { enum class BracketRole { Objective, Constraint }; void parse_quadratic_bracket(int outer_sign, BracketRole role, - std::vector>& out_quad_entries); + coo_entries_t& out_quad_entries); // Atomic readers. 
f_t parse_signed_number(); @@ -830,15 +829,16 @@ void LpParseEngine::parse_linear_expression(std::vector& o } template -void LpParseEngine::parse_quadratic_bracket( - int outer_sign, BracketRole role, std::vector>& out_quad_entries) +void LpParseEngine::parse_quadratic_bracket(int outer_sign, + BracketRole role, + coo_entries_t& out_quad_entries) { expect(LpTokenKind::LBracket, "'[' at start of quadratic section"); // Accumulate raw LP-format entries first (diagonal vs off-diagonal), then // apply the role-specific convention and outer sign after we see the // closing bracket. - std::vector> raw_quad; + coo_entries_t raw_quad; int sign = 1; bool first = true; @@ -939,9 +939,12 @@ void LpParseEngine::parse_quadratic_bracket( // directly to cuOpt's set_quadratic_objective_matrix, which internally // computes H = Q + Q^T; the solver then minimizes (1/2) x^T H x, which // recovers the user's intended objective. - for (auto& [a, b, v] : raw_quad) { - v /= f_t(2); - out_quad_entries.emplace_back(a, b, sign_scale * v); + out_quad_entries.rows.insert( + out_quad_entries.rows.end(), raw_quad.rows.begin(), raw_quad.rows.end()); + out_quad_entries.cols.insert( + out_quad_entries.cols.end(), raw_quad.cols.begin(), raw_quad.cols.end()); + for (size_t p = 0; p < raw_quad.size(); p++) { + out_quad_entries.vals.push_back(sign_scale * raw_quad.vals[p] / f_t(2)); } } else { // Constraint: '/ 2' is forbidden — the LP convention is that constraint @@ -964,8 +967,12 @@ void LpParseEngine::parse_quadratic_bracket( // Coefficients are at face value — the post-pass that flushes the // quadratic_constraint_block_t to the data model handles the symmetric // expansion and the /2 split for off-diagonals. 
- for (auto& [a, b, v] : raw_quad) { - out_quad_entries.emplace_back(a, b, sign_scale * v); + out_quad_entries.rows.insert( + out_quad_entries.rows.end(), raw_quad.rows.begin(), raw_quad.rows.end()); + out_quad_entries.cols.insert( + out_quad_entries.cols.end(), raw_quad.cols.begin(), raw_quad.cols.end()); + for (size_t p = 0; p < raw_quad.size(); p++) { + out_quad_entries.vals.push_back(sign_scale * raw_quad.vals[p]); } } } @@ -1044,7 +1051,7 @@ void LpParseEngine::parse_constraints_section() // Mirrors the objective handling; if present, this row becomes a // quadratic constraint and is stored on quadratic_constraint_blocks // instead of the linear arrays. - std::vector> qc_triples; + coo_entries_t qc_triples; bool is_quadratic_row = false; int quad_sign = 1; if (peek().kind == LpTokenKind::Plus && peek(1).kind == LpTokenKind::LBracket) { @@ -1473,8 +1480,9 @@ void finalize_problem(mps_data_model_t& problem, lp_parser_t // recovers the user's intended objective. if (!parser.quadobj_entries.empty()) { std::vector>> row_data(n_vars); - for (const auto& [row, col, val] : parser.quadobj_entries) { - row_data[row].emplace_back(col, val); + for (size_t p = 0; p < parser.quadobj_entries.size(); p++) { + row_data[parser.quadobj_entries.rows[p]].emplace_back(parser.quadobj_entries.cols[p], + parser.quadobj_entries.vals[p]); } for (auto& row : row_data) { std::sort(row.begin(), row.end()); @@ -1503,14 +1511,13 @@ template void flush_quadratic_constraints(mps_data_model_t& problem, const lp_parser_t& parser) { - const i_t n_vars = static_cast(parser.var_names.size()); const i_t linear_row_count = static_cast(parser.row_names.size()); - i_t k = 0; - for (const auto& block : parser.quadratic_constraint_blocks) { + for (i_t k = 0; k < static_cast(parser.quadratic_constraint_blocks.size()); k++) { + const auto& block = parser.quadratic_constraint_blocks[k]; + std::vector q_row_indices; + std::vector q_col_indices; std::vector q_values; - std::vector q_indices; - std::vector 
q_offsets; - build_symmetric_q_csr(block.quad_triples, n_vars, q_values, q_indices, q_offsets); + build_symmetric_q_coo(block.quad_triples, q_row_indices, q_col_indices, q_values); problem.append_quadratic_constraint(linear_row_count + k, block.row_name, static_cast(block.row_type), @@ -1518,9 +1525,8 @@ void flush_quadratic_constraints(mps_data_model_t& problem, block.linear_indices, block.rhs_value, q_values, - q_indices, - q_offsets); - ++k; + q_row_indices, + q_col_indices); } } diff --git a/cpp/src/io/lp_parser.hpp b/cpp/src/io/lp_parser.hpp index a607359657..83f61785be 100644 --- a/cpp/src/io/lp_parser.hpp +++ b/cpp/src/io/lp_parser.hpp @@ -12,7 +12,6 @@ #include #include -#include #include namespace cuopt::linear_programming::io { @@ -59,7 +58,7 @@ class lp_parser_t { // Quadratic objective entries (row, col, value) in upper-triangular // QUADOBJ convention; finalize_problem() mirrors to the full symmetric // matrix and applies the *0.5 factor required by cuOpt's x^T Q x form. - std::vector> quadobj_entries{}; + coo_entries_t quadobj_entries{}; // Per-row data for constraints whose LHS contains a quadratic bracket. // These rows do NOT appear in row_names/row_types/A_indices/A_values/ @@ -77,7 +76,7 @@ class lp_parser_t { // Upper-triangular (i <= j) raw triples directly from the LP source // (face value, no /2). The post-pass mirrors and halves off-diagonals // to build the symmetric Q in CSR. 
- std::vector> quad_triples{}; + coo_entries_t quad_triples{}; }; std::vector quadratic_constraint_blocks{}; }; diff --git a/cpp/src/io/mps_data_model.cpp b/cpp/src/io/mps_data_model.cpp index 7ae359e450..991c8d42a1 100644 --- a/cpp/src/io/mps_data_model.cpp +++ b/cpp/src/io/mps_data_model.cpp @@ -9,6 +9,7 @@ #include #include +#include #include namespace cuopt::linear_programming::io { @@ -145,26 +146,40 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ std::span linear_values, std::span linear_indices, f_t rhs_value, - std::span quadratic_values, - std::span quadratic_indices, - std::span quadratic_offsets) + std::span vals, + std::span rows, + std::span cols) { mps_parser_expects(constraint_row_index >= 0, error_type_t::ValidationError, "constraint_row_index must be non-negative"); - mps_parser_expects(constraint_row_type == 'L', + mps_parser_expects(constraint_row_type == 'L' || constraint_row_type == 'G', error_type_t::ValidationError, - "Quadratic constraint ROWS type must be 'L' (less-or-equal); got '%c'. 
" - "Only 'L' is supported for convex quadratic constraints.", + "Quadratic constraint ROWS type must be 'L' (<=) or 'G' (>=); got '%c'.", constraint_row_type); mps_parser_expects(linear_values.size() == linear_indices.size(), error_type_t::ValidationError, "linear_values and linear_indices must have the same nnz count"); + const size_t q_nnz = vals.size(); mps_parser_expects( - !quadratic_offsets.empty(), error_type_t::ValidationError, "quadratic_offsets cannot be empty"); + q_nnz == rows.size(), error_type_t::ValidationError, "vals and rows must have the same length"); + mps_parser_expects( + q_nnz == cols.size(), error_type_t::ValidationError, "vals and cols must have the same length"); + + if (!linear_values.empty()) { + mps_parser_expects(linear_values.data() != nullptr && linear_indices.data() != nullptr, + error_type_t::ValidationError, + "linear_values and linear_indices cannot be null when non-empty"); + } + + if (q_nnz > 0) { + mps_parser_expects(vals.data() != nullptr && rows.data() != nullptr && cols.data() != nullptr, + error_type_t::ValidationError, + "Q COO spans cannot be null when nnz > 0"); + } quadratic_constraint_t qc; qc.constraint_row_index = constraint_row_index; @@ -173,9 +188,33 @@ void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_ qc.rhs_value = rhs_value; qc.linear_values.assign(linear_values.begin(), linear_values.end()); qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); - qc.quadratic_values.assign(quadratic_values.begin(), quadratic_values.end()); - qc.quadratic_indices.assign(quadratic_indices.begin(), quadratic_indices.end()); - qc.quadratic_offsets.assign(quadratic_offsets.begin(), quadratic_offsets.end()); + + if (q_nnz == 0) { + qc.rows.clear(); + qc.cols.clear(); + qc.vals.clear(); + } else { + std::vector wr(rows.begin(), rows.end()); + std::vector wc(cols.begin(), cols.end()); + std::vector wv(vals.begin(), vals.end()); + + std::vector perm(q_nnz); + std::iota(perm.begin(), perm.end(), 
size_t{0}); + std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) { + if (wr[a] != wr[b]) { return wr[a] < wr[b]; } + return wc[a] < wc[b]; + }); + + qc.rows.resize(q_nnz); + qc.cols.resize(q_nnz); + qc.vals.resize(q_nnz); + for (size_t t = 0; t < q_nnz; ++t) { + const size_t ix = perm[t]; + qc.rows[t] = wr[ix]; + qc.cols[t] = wc[ix]; + qc.vals[t] = wv[ix]; + } + } quadratic_constraints_.push_back(std::move(qc)); } diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp index 535938b09c..5f7cecda94 100644 --- a/cpp/src/io/mps_parser.cpp +++ b/cpp/src/io/mps_parser.cpp @@ -23,8 +23,10 @@ #include namespace { +using cuopt::linear_programming::io::coo_entries_t; using cuopt::linear_programming::io::error_type_t; using cuopt::linear_programming::io::mps_parser_expects; +using cuopt::linear_programming::io::mps_parser_expects_fatal; std::vector string_to_buffer(std::string_view input) { @@ -34,6 +36,126 @@ std::vector string_to_buffer(std::string_view input) } } // end namespace +namespace { + +/** + * @brief Reusable scratch for converting (row,col,value) triples to CSR via CSC (double transpose). + * + * Avoids `std::vector>` per column/row, which is allocation-heavy for large Q + * blocks (e.g. many QCMATRIX sections in portfolio models). + */ +template +struct triples_to_csr_scratch_t { + std::vector col_nnz{}; + std::vector col_off{}; + std::vector col_wr{}; + std::vector csc_rows{}; + std::vector csc_vals{}; + std::vector row_nnz{}; + std::vector row_off{}; + std::vector row_wr{}; +}; + +/** + * @brief Build CSR from coordinate triples via CSC (double transpose): column buckets, then CSR + * with column indices ascending within each row. + * + * @param symmetrize_upper_triangular If true (QUADOBJ), each off-diagonal (r,c) also adds (c,r). 
+ */ +template +void triples_to_csr_flat(const coo_entries_t& entries, + i_t num_rows, + i_t num_cols, + bool symmetrize_upper_triangular, + f_t value_scale, + triples_to_csr_scratch_t& scratch, + std::vector& out_values, + std::vector& out_indices, + std::vector& out_offsets) +{ + if (entries.empty()) { + out_values.clear(); + out_indices.clear(); + out_offsets.assign(num_rows + 1, 0); + return; + } + + const i_t n_entries = static_cast(entries.size()); + + scratch.col_nnz.assign(num_cols, 0); + for (i_t i = 0; i < n_entries; ++i) { + const i_t r = entries.rows[i]; + const i_t c = entries.cols[i]; + scratch.col_nnz[c]++; + if (symmetrize_upper_triangular && r != c) { scratch.col_nnz[r]++; } + } + + scratch.col_off.resize(num_cols + 1); + scratch.col_off[0] = 0; + for (i_t c = 0; c < num_cols; ++c) { + scratch.col_off[c + 1] = scratch.col_off[c] + scratch.col_nnz[c]; + } + const i_t csc_nnz = scratch.col_off[num_cols]; + scratch.csc_rows.resize(csc_nnz); + scratch.csc_vals.resize(csc_nnz); + scratch.col_wr.resize(num_cols); + std::copy(scratch.col_off.begin(), scratch.col_off.begin() + num_cols, scratch.col_wr.begin()); + + for (i_t i = 0; i < n_entries; ++i) { + const i_t r = entries.rows[i]; + const i_t c = entries.cols[i]; + const f_t v = entries.vals[i]; + { + const i_t p = scratch.col_wr[c]++; + scratch.csc_rows[p] = r; + scratch.csc_vals[p] = v; + } + if (symmetrize_upper_triangular && r != c) { + const i_t p = scratch.col_wr[r]++; + scratch.csc_rows[p] = c; + scratch.csc_vals[p] = v; + } + } + + scratch.row_nnz.assign(num_rows, 0); + for (i_t cc = 0; cc < num_cols; ++cc) { + const i_t lo = scratch.col_off[cc]; + const i_t hi = scratch.col_off[cc + 1]; + for (i_t t = lo; t < hi; ++t) { + const i_t row = scratch.csc_rows[t]; + scratch.row_nnz[row]++; + } + } + + scratch.row_off.resize(num_rows + 1); + scratch.row_off[0] = 0; + for (i_t r = 0; r < num_rows; ++r) { + scratch.row_off[r + 1] = scratch.row_off[r] + scratch.row_nnz[r]; + } + const i_t csr_nnz = 
scratch.row_off[num_rows]; + + out_values.resize(csr_nnz); + out_indices.resize(csr_nnz); + scratch.row_wr.resize(num_rows); + std::copy(scratch.row_off.begin(), scratch.row_off.begin() + num_rows, scratch.row_wr.begin()); + + for (i_t cc = 0; cc < num_cols; ++cc) { + const i_t lo = scratch.col_off[cc]; + const i_t hi = scratch.col_off[cc + 1]; + for (i_t t = lo; t < hi; ++t) { + const i_t row = scratch.csc_rows[t]; + const f_t val = scratch.csc_vals[t]; + const i_t w = scratch.row_wr[row]++; + out_indices[w] = cc; + out_values[w] = val * value_scale; + } + } + + out_offsets = std::move(scratch.row_off); +} + +} // namespace + namespace cuopt::linear_programming::io { template @@ -288,115 +410,58 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.set_variable_types(std::move(var_types)); problem.set_maximize(maximize); - // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of - // O(nnz*log(nnz))) For QUADOBJ: handles upper triangular input by expanding to full symmetric - // matrix. + // Build CSR from (row,col,value) triples via double transpose (CSC then CSR). Uses flat buffers + // plus reusable scratch to avoid per-column/per-row `std::vector` allocations. // - // @p value_scale: - // QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); - // QCMATRIX uses 1.0 (symmetric Q defines xᵀQx directly in the constraint). 
- auto build_csr_via_transpose = [](const std::vector>& entries, - i_t num_rows, - i_t num_cols, - bool symmetrize_upper_triangular, - f_t value_scale) { - struct CSRResult { - std::vector values; - std::vector indices; - std::vector offsets; - }; - - if (entries.empty()) { - CSRResult result; - result.offsets.resize(num_rows + 1, 0); - return result; - } - - // First transpose: build CSC format (entries sorted by column) - std::vector>> csc_data(num_cols); - for (const auto& entry : entries) { - i_t row = std::get<0>(entry); - i_t col = std::get<1>(entry); - f_t val = std::get<2>(entry); - - // For QUADOBJ (upper triangular), add both (row,col) and (col,row) if off-diagonal - csc_data[col].emplace_back(row, val); - if (symmetrize_upper_triangular && row != col) { csc_data[row].emplace_back(col, val); } - } - - // Second transpose: convert CSC to CSR (entries sorted by row, columns within rows sorted) - std::vector>> csr_data(num_rows); - for (i_t col = 0; col < num_cols; ++col) { - for (const auto& [row, val] : csc_data[col]) { - csr_data[row].emplace_back(col, val); - } - } - - // Build final CSR format - CSRResult result; - result.offsets.reserve(num_rows + 1); - result.offsets.push_back(0); - - for (i_t row = 0; row < num_rows; ++row) { - for (const auto& [col, val] : csr_data[row]) { - // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q xExpand - // commentComment on line L488 so we have to multiply the value by value_scale=0.5 to get - // the correct value. - result.values.push_back(val * value_scale); - result.indices.push_back(col); - } - result.offsets.push_back(result.values.size()); - } - - return result; - }; + // value_scale: QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); QCMATRIX uses 1.0. 
+ triples_to_csr_scratch_t triple_csr_scratch{}; + std::vector quad_csr_values{}; + std::vector quad_csr_indices{}; + std::vector quad_csr_offsets{}; // Process QUADOBJ data if present (upper triangular format) if (!quadobj_entries.empty()) { - // Convert quadratic objective entries to CSR format using double transpose - // QUADOBJ stores upper triangular elements, so we expand to full symmetric matrix constexpr f_t k_mps_quad_half_scale = f_t(0.5); // MPS ½ xᵀQx vs internal xᵀQx - auto csr_result = build_csr_via_transpose( - quadobj_entries, num_vars_for_quad, num_vars_for_quad, true, k_mps_quad_half_scale); - - // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) - problem.set_quadratic_objective_matrix( - csr_result.values, csr_result.indices, csr_result.offsets); + triples_to_csr_flat(quadobj_entries, + num_vars_for_quad, + num_vars_for_quad, + true, + k_mps_quad_half_scale, + triple_csr_scratch, + quad_csr_values, + quad_csr_indices, + quad_csr_offsets); + problem.set_quadratic_objective_matrix(quad_csr_values, quad_csr_indices, quad_csr_offsets); } else if (!qmatrix_entries.empty()) { - // Convert quadratic objective entries to CSR format using double transpose - // QMATRIX stores full symmetric matrix constexpr f_t k_mps_quad_half_scale = f_t(0.5); - auto csr_result = build_csr_via_transpose( - qmatrix_entries, num_vars_for_quad, num_vars_for_quad, false, k_mps_quad_half_scale); - - // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) - problem.set_quadratic_objective_matrix( - csr_result.values, csr_result.indices, csr_result.offsets); + triples_to_csr_flat(qmatrix_entries, + num_vars_for_quad, + num_vars_for_quad, + false, + k_mps_quad_half_scale, + triple_csr_scratch, + quad_csr_values, + quad_csr_indices, + quad_csr_offsets); + problem.set_quadratic_objective_matrix(quad_csr_values, quad_csr_indices, quad_csr_offsets); } // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file 
coeffs). - // Bundle row metadata, row-linear coefficients (from COLUMNS), rhs, and quadratic part together. - constexpr f_t k_qcmatrix_value_scale = f_t(1); - const i_t linear_row_count = static_cast(row_types.size() - quadratic_row_ids.size()); - i_t quadratic_row_id = 0; for (const auto& block : qcmatrix_blocks_) { - auto csr_result = build_csr_via_transpose( - block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); const i_t row_id = block.constraint_row_id; mps_parser_expects(row_id >= 0 && row_id < static_cast(row_types.size()), error_type_t::ValidationError, "QCMATRIX row index %d is out of range for constraints", static_cast(row_id)); - problem.append_quadratic_constraint(linear_row_count + quadratic_row_id, + problem.append_quadratic_constraint(row_id, row_names[row_id], static_cast(row_types[row_id]), A_values[row_id], A_indices[row_id], b_values[row_id], - csr_result.values, - csr_result.indices, - csr_result.offsets); - ++quadratic_row_id; + block.entries.vals, + block.entries.rows, + block.entries.cols); } if (!quadratic_row_ids.empty()) { @@ -694,9 +759,7 @@ mps_parser_t::mps_parser_t(mps_data_model_t& problem, // raft::common::nvtx::range fun_scope("mps parser"); std::vector buf = detail::file_to_string(file); - parse_string(buf.data()); - fill_problem(problem); } diff --git a/cpp/src/io/mps_parser_internal.hpp b/cpp/src/io/mps_parser_internal.hpp index af27083fcb..d510bd4f57 100644 --- a/cpp/src/io/mps_parser_internal.hpp +++ b/cpp/src/io/mps_parser_internal.hpp @@ -12,13 +12,37 @@ #include #include #include -#include #include #include #include namespace cuopt::linear_programming::io { +/** + * Sparse COO (coordinate) entries for a matrix: parallel row/col/val vectors. 
+ */ +template +struct coo_entries_t { + std::vector rows{}; + std::vector cols{}; + std::vector vals{}; + + void emplace_back(i_t r, i_t c, f_t v) + { + rows.push_back(r); + cols.push_back(c); + vals.push_back(v); + } + bool empty() const { return rows.empty(); } + size_t size() const { return rows.size(); } + void clear() + { + rows.clear(); + cols.clear(); + vals.clear(); + } +}; + /** * @brief Different possible types of 'ROWS' */ @@ -128,8 +152,8 @@ class mps_parser_t { // QPS-specific data for quadratic programming /** Quadratic objective matrix entries */ - std::vector> quadobj_entries{}; - std::vector> qmatrix_entries{}; + coo_entries_t quadobj_entries{}; + coo_entries_t qmatrix_entries{}; private: bool inside_rows_{false}; @@ -148,12 +172,12 @@ class mps_parser_t { /** (free-format) QCMATRIX: finalized blocks (row id + triples) */ struct qcmatrix_raw_block_t { i_t constraint_row_id{}; - std::vector> entries{}; + coo_entries_t entries{}; }; std::vector qcmatrix_blocks_{}; /** Triples for the QCMATRIX block currently being read (-1 row id means none) */ i_t qcmatrix_active_row_id_{-1}; - std::vector> qcmatrix_current_entries_{}; + coo_entries_t qcmatrix_current_entries_{}; std::unordered_set encountered_sections{}; std::unordered_map row_names_map{}; diff --git a/cpp/src/io/mps_writer.cpp b/cpp/src/io/mps_writer.cpp index 73489277ce..9db528ca5b 100644 --- a/cpp/src/io/mps_writer.cpp +++ b/cpp/src/io/mps_writer.cpp @@ -157,7 +157,15 @@ void mps_writer_t::write(const std::string& mps_file_path) std::vector constraint_bounds(problem_.get_constraint_bounds().size()); std::vector variable_lower_bounds(problem_.get_variable_lower_bounds().size()); std::vector variable_upper_bounds(problem_.get_variable_upper_bounds().size()); - std::vector variable_types(problem_.get_variable_types().size()); + // Default unset variable types to continuous ('C'); API models often omit set_variable_types. 
+ std::vector variable_types(static_cast(n_variables), 'C'); + { + const auto& src_types = problem_.get_variable_types(); + const size_t n_copy = std::min(src_types.size(), variable_types.size()); + for (size_t j = 0; j < n_copy; ++j) { + variable_types[j] = src_types[j]; + } + } std::vector row_types(problem_.get_row_types().size()); std::vector constraint_matrix_offsets(problem_.get_constraint_matrix_offsets().size()); std::vector constraint_matrix_indices(problem_.get_constraint_matrix_indices().size()); @@ -178,9 +186,6 @@ void mps_writer_t::write(const std::string& mps_file_path) problem_.get_variable_upper_bounds().data(), problem_.get_variable_upper_bounds().data() + problem_.get_variable_upper_bounds().size(), variable_upper_bounds.data()); - std::copy(problem_.get_variable_types().data(), - problem_.get_variable_types().data() + problem_.get_variable_types().size(), - variable_types.data()); std::copy(problem_.get_row_types().data(), problem_.get_row_types().data() + problem_.get_row_types().size(), row_types.data()); @@ -243,8 +248,8 @@ void mps_writer_t::write(const std::string& mps_file_path) const auto& qc = quadratic_constraints[q]; std::string row_name = qc.constraint_row_name.empty() ? "QC" + std::to_string(q) : qc.constraint_row_name; - // Quadratic rows are currently restricted to MPS 'L' (<=). - mps_file << " L " << row_name << "\n"; + char const type = qc.constraint_row_type; + mps_file << " " << type << " " << row_name << "\n"; } // COLUMNS section @@ -274,18 +279,15 @@ void mps_writer_t::write(const std::string& mps_file_path) } // Quadratic constraint rows omit linear coefficients from global A; add them from QC bundles. + // QP/QCQP models are continuous-only (no integer variables). 
if (problem_.has_quadratic_constraints()) { for (size_t q = 0; q < quadratic_constraints.size(); ++q) { const auto& qc = quadratic_constraints[q]; - const size_t row_id = static_cast(n_constraints) + q; + const size_t row_id = n_constraints + q; for (size_t t = 0; t < qc.linear_indices.size(); ++t) { - size_t var = static_cast(qc.linear_indices[t]); - f_t val = qc.linear_values[t]; - if (variable_types[var] == 'I') { - integral_col_nnzs[var].emplace_back(row_id, val); - } else { - continuous_col_nnzs[var].emplace_back(row_id, val); - } + i_t var = qc.linear_indices[t]; + f_t val = qc.linear_values[t]; + continuous_col_nnzs[var].emplace_back(row_id, val); var_in_constraint[var] = true; } } @@ -324,14 +326,13 @@ void mps_writer_t::write(const std::string& mps_file_path) : "C" + std::to_string(var_id); for (auto& nnz : nnzs) { std::string row_name; - if (static_cast(nnz.first) < static_cast(n_constraints)) { + if (nnz.first < static_cast(n_constraints)) { // Linear rows: do not use row-name count here—names are optional; row id is 0..m-1. - row_name = static_cast(nnz.first) < problem_.get_row_names().size() + row_name = nnz.first < problem_.get_row_names().size() ? problem_.get_row_names()[nnz.first] : "R" + std::to_string(nnz.first); - } else if (static_cast(nnz.first) < - static_cast(n_constraints) + quadratic_constraints.size()) { - const size_t q = static_cast(nnz.first) - static_cast(n_constraints); + } else if (nnz.first < static_cast(n_constraints) + quadratic_constraints.size()) { + const size_t q = nnz.first - static_cast(n_constraints); row_name = quadratic_constraints[q].constraint_row_name.empty() ? 
"QC" + std::to_string(q) : quadratic_constraints[q].constraint_row_name; @@ -497,20 +498,19 @@ void mps_writer_t::write(const std::string& mps_file_path) if (problem_.has_quadratic_constraints()) { for (const auto& qc : problem_.get_quadratic_constraints()) { mps_file << "QCMATRIX " << qc.constraint_row_name << "\n"; - const i_t n_quad_rows = static_cast(qc.quadratic_offsets.size()) - 1; - for (i_t i = 0; i < n_quad_rows; ++i) { + const i_t nnz = qc.vals.size(); + for (i_t p = 0; p < nnz; ++p) { + const i_t i = qc.rows[p]; + const i_t j = qc.cols[p]; + f_t v = qc.vals[p]; std::string row_var_name = static_cast(i) < problem_.get_variable_names().size() ? problem_.get_variable_names()[i] : "C" + std::to_string(i); - for (i_t p = qc.quadratic_offsets[i]; p < qc.quadratic_offsets[i + 1]; ++p) { - i_t j = qc.quadratic_indices[p]; - f_t v = qc.quadratic_values[p]; - std::string col_var_name = static_cast(j) < problem_.get_variable_names().size() - ? problem_.get_variable_names()[j] - : "C" + std::to_string(j); - if (v != f_t(0)) { - mps_file << " " << row_var_name << " " << col_var_name << " " << v << "\n"; - } + std::string col_var_name = static_cast(j) < problem_.get_variable_names().size() + ? 
problem_.get_variable_names()[j] + : "C" + std::to_string(j); + if (v != 0) { + mps_file << " " << row_var_name << " " << col_var_name << " " << v << "\n"; } } } diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index 0f669a0321..5bb3e25cad 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -143,54 +143,34 @@ void cpu_optimization_problem_t::set_quadratic_constraints( } template -void cpu_optimization_problem_t::add_quadratic_constraint(char constraint_row_type, - f_t rhs_value, - const f_t* quadratic_values, - i_t size_quadratic_values, - const i_t* quadratic_indices, - i_t size_quadratic_indices, - const i_t* quadratic_offsets, - i_t size_quadratic_offsets, - const f_t* linear_values, - i_t size_linear_values, - const i_t* linear_indices, - i_t size_linear_indices) -{ - cuopt_expects(size_quadratic_offsets >= 1, +void cpu_optimization_problem_t::add_quadratic_constraint( + char constraint_row_type, + f_t rhs_value, + std::span row_index, + std::span col_index, + std::span coeff, + std::span linear_values, + std::span linear_indices) +{ + cuopt_expects(!row_index.empty(), error_type_t::ValidationError, - "quadratic_offsets must have at least one element"); - cuopt_expects(quadratic_offsets != nullptr, + "quadratic constraint must have at least one matrix entry"); + cuopt_expects(row_index.size() == col_index.size() && row_index.size() == coeff.size(), error_type_t::ValidationError, - "quadratic_offsets cannot be null"); - cuopt_expects(size_linear_values == size_linear_indices, + "row_index, col_index, and coeff must have the same size"); + cuopt_expects(linear_values.size() == linear_indices.size(), error_type_t::ValidationError, "linear_values and linear_indices must have the same size"); - if (size_quadratic_values != 0) { - cuopt_expects(quadratic_values != nullptr, - error_type_t::ValidationError, - "quadratic_values cannot be null"); - } - if 
(size_quadratic_indices != 0) { - cuopt_expects(quadratic_indices != nullptr, - error_type_t::ValidationError, - "quadratic_indices cannot be null"); - } - if (size_linear_values != 0) { - cuopt_expects( - linear_values != nullptr, error_type_t::ValidationError, "linear_values cannot be null"); - cuopt_expects( - linear_indices != nullptr, error_type_t::ValidationError, "linear_indices cannot be null"); - } typename optimization_problem_interface_t::quadratic_constraint_t qc; qc.constraint_row_index = get_n_constraints() + static_cast(quadratic_constraints_.size()); qc.constraint_row_type = constraint_row_type; qc.rhs_value = rhs_value; - qc.quadratic_values.assign(quadratic_values, quadratic_values + size_quadratic_values); - qc.quadratic_indices.assign(quadratic_indices, quadratic_indices + size_quadratic_indices); - qc.quadratic_offsets.assign(quadratic_offsets, quadratic_offsets + size_quadratic_offsets); - qc.linear_values.assign(linear_values, linear_values + size_linear_values); - qc.linear_indices.assign(linear_indices, linear_indices + size_linear_indices); + qc.rows.assign(row_index.begin(), row_index.end()); + qc.cols.assign(col_index.begin(), col_index.end()); + qc.vals.assign(coeff.begin(), coeff.end()); + qc.linear_values.assign(linear_values.begin(), linear_values.end()); + qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); quadratic_constraints_.push_back(std::move(qc)); } diff --git a/cpp/src/pdlp/cuopt_c.cpp b/cpp/src/pdlp/cuopt_c.cpp index 4571b80743..fa27e4504b 100644 --- a/cpp/src/pdlp/cuopt_c.cpp +++ b/cpp/src/pdlp/cuopt_c.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -603,32 +604,23 @@ cuopt_int_t cuOptAddQuadraticConstraint(cuOptOptimizationProblem problem, } try { - std::vector Q_offsets; - std::vector Q_indices; - std::vector Q_values; - coo_to_csr(quad_num_entries, - row_index, - col_index, - coeff, - num_variables, - num_variables, - Q_offsets, - Q_indices, - Q_values); - if 
(Q_offsets.empty()) { return CUOPT_INVALID_ARGUMENT; } - - op_problem->add_quadratic_constraint(sense, - rhs, - Q_values.data(), - static_cast(Q_values.size()), - Q_indices.data(), - static_cast(Q_indices.size()), - Q_offsets.data(), - static_cast(Q_offsets.size()), - linear_coeff, - num_lin_entries, - linear_index, - num_lin_entries); + const auto row_index_span = + std::span(row_index, static_cast(quad_num_entries)); + const auto col_index_span = + std::span(col_index, static_cast(quad_num_entries)); + const auto coeff_span = + std::span(coeff, static_cast(quad_num_entries)); + const auto linear_coeff_span = + num_lin_entries == 0 + ? std::span{} + : std::span(linear_coeff, static_cast(num_lin_entries)); + const auto linear_index_span = + num_lin_entries == 0 + ? std::span{} + : std::span(linear_index, static_cast(num_lin_entries)); + + op_problem->add_quadratic_constraint( + sense, rhs, row_index_span, col_index_span, coeff_span, linear_coeff_span, linear_index_span); } catch (const raft::exception&) { return CUOPT_INVALID_ARGUMENT; } catch (const std::exception&) { diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu index 8b68009a82..70b9fb7e68 100644 --- a/cpp/src/pdlp/optimization_problem.cu +++ b/cpp/src/pdlp/optimization_problem.cu @@ -217,52 +217,31 @@ void optimization_problem_t::set_quadratic_constraints( template void optimization_problem_t::add_quadratic_constraint(char constraint_row_type, f_t rhs_value, - const f_t* quadratic_values, - i_t size_quadratic_values, - const i_t* quadratic_indices, - i_t size_quadratic_indices, - const i_t* quadratic_offsets, - i_t size_quadratic_offsets, - const f_t* linear_values, - i_t size_linear_values, - const i_t* linear_indices, - i_t size_linear_indices) -{ - cuopt_expects(size_quadratic_offsets >= 1, + std::span row_index, + std::span col_index, + std::span coeff, + std::span linear_values, + std::span linear_indices) +{ + cuopt_expects(!row_index.empty(), 
error_type_t::ValidationError, - "quadratic_offsets must have at least one element"); - cuopt_expects(quadratic_offsets != nullptr, + "quadratic constraint must have at least one matrix entry"); + cuopt_expects(row_index.size() == col_index.size() && row_index.size() == coeff.size(), error_type_t::ValidationError, - "quadratic_offsets cannot be null"); - cuopt_expects(size_linear_values == size_linear_indices, + "row_index, col_index, and coeff must have the same size"); + cuopt_expects(linear_values.size() == linear_indices.size(), error_type_t::ValidationError, "linear_values and linear_indices must have the same size"); - if (size_quadratic_values != 0) { - cuopt_expects(quadratic_values != nullptr, - error_type_t::ValidationError, - "quadratic_values cannot be null"); - } - if (size_quadratic_indices != 0) { - cuopt_expects(quadratic_indices != nullptr, - error_type_t::ValidationError, - "quadratic_indices cannot be null"); - } - if (size_linear_values != 0) { - cuopt_expects( - linear_values != nullptr, error_type_t::ValidationError, "linear_values cannot be null"); - cuopt_expects( - linear_indices != nullptr, error_type_t::ValidationError, "linear_indices cannot be null"); - } typename optimization_problem_interface_t::quadratic_constraint_t qc; qc.constraint_row_index = get_n_constraints() + static_cast(quadratic_constraints_.size()); qc.constraint_row_type = constraint_row_type; qc.rhs_value = rhs_value; - qc.quadratic_values.assign(quadratic_values, quadratic_values + size_quadratic_values); - qc.quadratic_indices.assign(quadratic_indices, quadratic_indices + size_quadratic_indices); - qc.quadratic_offsets.assign(quadratic_offsets, quadratic_offsets + size_quadratic_offsets); - qc.linear_values.assign(linear_values, linear_values + size_linear_values); - qc.linear_indices.assign(linear_indices, linear_indices + size_linear_indices); + qc.rows.assign(row_index.begin(), row_index.end()); + qc.cols.assign(col_index.begin(), col_index.end()); + 
qc.vals.assign(coeff.begin(), coeff.end()); + qc.linear_values.assign(linear_values.begin(), linear_values.end()); + qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); quadratic_constraints_.push_back(std::move(qc)); } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 076952811e..07a4676120 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -507,6 +507,7 @@ run_barrier(dual_simplex::user_problem_t& user_problem, barrier_settings.ordering = settings.ordering; barrier_settings.barrier_dual_initial_point = settings.barrier_dual_initial_point; barrier_settings.barrier = true; + barrier_settings.barrier_presolve = true; barrier_settings.crossover = settings.crossover; barrier_settings.eliminate_dense_columns = settings.eliminate_dense_columns; barrier_settings.barrier_iterative_refinement = settings.barrier_iterative_refinement; @@ -524,6 +525,8 @@ run_barrier(dual_simplex::user_problem_t& user_problem, auto status = dual_simplex::solve_linear_program_with_barrier( user_problem, barrier_settings, timer.get_tic_start(), solution); + detail::project_barrier_solution_to_model_variables(user_problem, solution); + CUOPT_LOG_CONDITIONAL_INFO( !settings.inside_mip, "Barrier finished in %.2f seconds", timer.elapsed_time()); @@ -1779,9 +1782,10 @@ optimization_problem_solution_t solve_lp_with_method( } template -optimization_problem_solution_t solve_qp(optimization_problem_t& op_problem, - pdlp_solver_settings_t const& settings, - bool problem_checking) +optimization_problem_solution_t solve_qcqp( + optimization_problem_t& op_problem, + pdlp_solver_settings_t const& settings, + bool problem_checking) { try { // Create log stream for file logging and add it to default logger @@ -1791,9 +1795,38 @@ optimization_problem_solution_t solve_qp(optimization_problem_t::check_problem_representation(op_problem); + if (problem_checking_t::has_crossing_bounds(op_problem)) { + return optimization_problem_solution_t( + 
pdlp_termination_status_t::PrimalInfeasible, op_problem.get_handle_ptr()->get_stream()); + } + } - raft::common::nvtx::range fun_scope("Running QP solver"); + if (op_problem.has_quadratic_objective() && op_problem.get_sense()) { + CUOPT_LOG_ERROR("Quadratic problems must be minimized"); + return optimization_problem_solution_t(pdlp_termination_status_t::NumericalError, + op_problem.get_handle_ptr()->get_stream()); + } + + raft::common::nvtx::range fun_scope("Running QCQP solver"); + const bool has_q_obj = op_problem.has_quadratic_objective(); + const bool has_qc = op_problem.has_quadratic_constraints(); + if (has_q_obj && has_qc) { + CUOPT_LOG_INFO( + "Problem has a quadratic objective and %d quadratic constraints. Converting constraints to " + "second-order cones and solving with barrier.", + static_cast(op_problem.get_quadratic_constraints().size())); + } else if (has_q_obj) { + CUOPT_LOG_INFO("Problem has a quadratic objective. Solving with barrier."); + } else { + CUOPT_LOG_INFO( + "Problem has %d quadratic constraints. 
Converting to second-order cones and solving with " + "barrier.", + static_cast(op_problem.get_quadratic_constraints().size())); + } if (settings.user_problem_file != "") { CUOPT_LOG_INFO("Writing user problem to file: %s", settings.user_problem_file.c_str()); op_problem.write_to_mps(settings.user_problem_file); @@ -1801,7 +1834,7 @@ optimization_problem_solution_t solve_qp(optimization_problem_t dual_simplex_problem = cuopt_optimization_problem_to_user_problem(op_problem.get_handle_ptr(), op_problem); - auto sol_dual_simplex = run_barrier(dual_simplex_problem, settings, qp_timer); + auto sol_dual_simplex = run_barrier(dual_simplex_problem, settings, qcqp_timer); auto solution = convert_dual_simplex_sol(op_problem, std::get<0>(sol_dual_simplex), std::get<1>(sol_dual_simplex), @@ -1815,10 +1848,10 @@ optimization_problem_solution_t solve_qp(optimization_problem_t{e, op_problem.get_handle_ptr()->get_stream()}; } catch (const std::bad_alloc& e) { - CUOPT_LOG_ERROR("Error in solve_qp: %s", e.what()); + CUOPT_LOG_ERROR("Error in solve_qcqp: %s", e.what()); return optimization_problem_solution_t{ cuopt::logic_error("Memory allocation failed", cuopt::error_type_t::RuntimeError), op_problem.get_handle_ptr()->get_stream()}; @@ -1833,8 +1866,8 @@ optimization_problem_solution_t solve_lp( bool use_pdlp_solver_mode, bool is_batch_mode) { - if (op_problem.has_quadratic_objective()) { - return solve_qp(op_problem, settings_const, problem_checking); + if (op_problem.has_quadratic_objective() || op_problem.has_quadratic_constraints()) { + return solve_qcqp(op_problem, settings_const, problem_checking); } try { @@ -2078,12 +2111,18 @@ cuopt::linear_programming::optimization_problem_t mps_data_model_to_op cuopt::linear_programming::optimization_problem_t op_problem(handle_ptr); op_problem.set_maximize(data_model.get_sense()); - op_problem.set_csr_constraint_matrix(data_model.get_constraint_matrix_values().data(), - data_model.get_constraint_matrix_values().size(), - 
data_model.get_constraint_matrix_indices().data(), - data_model.get_constraint_matrix_indices().size(), - data_model.get_constraint_matrix_offsets().data(), - data_model.get_constraint_matrix_offsets().size()); + if (data_model.get_constraint_matrix_values().size() != 0) { + op_problem.set_csr_constraint_matrix(data_model.get_constraint_matrix_values().data(), + data_model.get_constraint_matrix_values().size(), + data_model.get_constraint_matrix_indices().data(), + data_model.get_constraint_matrix_indices().size(), + data_model.get_constraint_matrix_offsets().data(), + data_model.get_constraint_matrix_offsets().size()); + } else { + // Set empty constraint matrix + std::vector offsets(1, 0); + op_problem.set_csr_constraint_matrix(nullptr, 0, nullptr, 0, offsets.data(), 1); + } if (data_model.get_constraint_bounds().size() != 0) { op_problem.set_constraint_bounds(data_model.get_constraint_bounds().data(), diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index 7d78da80eb..774582a555 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -12,10 +12,16 @@ #include #include +#include + +#include #include #include +#include +#include +#include namespace cuopt::linear_programming { @@ -111,8 +117,6 @@ static dual_simplex::user_problem_t cuopt_problem_to_user_problem( csr_A.j = std::vector(cuopt::host_copy(model.variables, handle_ptr->get_stream())); csr_A.row_start = std::vector(cuopt::host_copy(model.offsets, handle_ptr->get_stream())); - csr_A.to_compressed_col(user_problem.A); - user_problem.rhs.resize(m); user_problem.row_sense.resize(m); user_problem.range_rows.clear(); @@ -186,6 +190,13 @@ static dual_simplex::user_problem_t cuopt_problem_to_user_problem( user_problem.Q_indices = model.Q_indices; user_problem.Q_values = model.Q_values; + if (model.original_problem_ptr->has_quadratic_constraints()) { + detail::convert_quadratic_constraints_to_second_order_cones( + n, model.original_problem_ptr->get_quadratic_constraints(), 
csr_A, user_problem); + } + + csr_A.to_compressed_col(user_problem.A); + return user_problem; } @@ -211,7 +222,6 @@ static dual_simplex::user_problem_t cuopt_optimization_problem_to_user csr_A.row_start.resize(1); csr_A.row_start[0] = 0; } - csr_A.to_compressed_col(user_problem.A); user_problem.rhs.resize(m); user_problem.row_sense.resize(m); @@ -291,6 +301,13 @@ static dual_simplex::user_problem_t cuopt_optimization_problem_to_user user_problem.Q_indices = model.get_quadratic_objective_indices(); user_problem.Q_values = model.get_quadratic_objective_values(); + if (model.has_quadratic_constraints()) { + detail::convert_quadratic_constraints_to_second_order_cones( + static_cast(n), model.get_quadratic_constraints(), csr_A, user_problem); + } + + csr_A.to_compressed_col(user_problem.A); + return user_problem; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3642044551..19bb27d593 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -102,6 +102,7 @@ if(BUILD_TESTS) endif() add_subdirectory(linear_programming) add_subdirectory(dual_simplex) + add_subdirectory(socp) add_subdirectory(qp) add_subdirectory(utilities) diff --git a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu index 0c8c591e97..0141a4eae6 100644 --- a/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu +++ b/cpp/tests/dual_simplex/unit_tests/solve_barrier.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_test.c b/cpp/tests/linear_programming/c_api_tests/c_api_test.c index 2500e2e9d4..6a587e0da7 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_test.c +++ b/cpp/tests/linear_programming/c_api_tests/c_api_test.c @@ -1502,6 +1502,126 @@ cuopt_int_t test_quadratic_ranged_problem(cuopt_int_t* termination_status_ptr, return status; } +cuopt_int_t test_quadratic_constraint_problem(cuopt_int_t* 
termination_status_ptr, + cuopt_float_t* objective_ptr, + cuopt_float_t* solution_values) +{ + cuOptOptimizationProblem problem = NULL; + cuOptSolverSettings settings = NULL; + cuOptSolution solution = NULL; + + // Same QCQP as python/cuopt/cuopt/tests/socp/test_socp.py::build_socp_1: + // min 3*x0 + 2*x1 + x2 + // s.t. x0^2 + x1^2 + x2^2 - y^2 <= 0 + // x0 + x1 + 3*x2 >= 1 + // 0 <= y <= 5 + // (x0, x1, x2 free) + cuopt_int_t num_variables = 4; + cuopt_int_t num_linear_constraints = 1; + cuopt_int_t objective_sense = CUOPT_MINIMIZE; + cuopt_float_t objective_offset = 0.0; + cuopt_float_t objective_coefficients[] = {3.0, 2.0, 1.0, 0.0}; + + cuopt_int_t row_offsets[] = {0, 3}; + cuopt_int_t column_indices[] = {0, 1, 2}; + cuopt_float_t values[] = {1.0, 1.0, 3.0}; + + cuopt_float_t constraint_bounds[] = {1.0}; + char constraint_sense[] = {CUOPT_GREATER_THAN}; + + cuopt_float_t var_lower_bounds[] = {-CUOPT_INFINITY, -CUOPT_INFINITY, -CUOPT_INFINITY, 0.0}; + cuopt_float_t var_upper_bounds[] = { + CUOPT_INFINITY, CUOPT_INFINITY, CUOPT_INFINITY, 5.0}; + char variable_types[] = {CUOPT_CONTINUOUS, + CUOPT_CONTINUOUS, + CUOPT_CONTINUOUS, + CUOPT_CONTINUOUS}; + + cuopt_int_t qc_row_index[] = {0, 1, 2, 3}; + cuopt_int_t qc_col_index[] = {0, 1, 2, 3}; + cuopt_float_t qc_coeff[] = {1.0, 1.0, 1.0, -1.0}; + + cuopt_int_t status; + + status = cuOptCreateProblem(num_linear_constraints, + num_variables, + objective_sense, + objective_offset, + objective_coefficients, + row_offsets, + column_indices, + values, + constraint_sense, + constraint_bounds, + var_lower_bounds, + var_upper_bounds, + variable_types, + &problem); + + if (status != CUOPT_SUCCESS) { + printf("Error creating problem: %d\n", status); + goto DONE; + } + + status = cuOptAddQuadraticConstraint(problem, + 4, + qc_row_index, + qc_col_index, + qc_coeff, + 0, + NULL, + NULL, + CUOPT_LESS_THAN, + 0.0); + if (status != CUOPT_SUCCESS) { + printf("Error adding quadratic constraint: %d\n", status); + goto DONE; + } + + status 
= cuOptCreateSolverSettings(&settings); + if (status != CUOPT_SUCCESS) { + printf("Error creating solver settings: %d\n", status); + goto DONE; + } + + status = cuOptSetIntegerParameter(settings, CUOPT_METHOD, CUOPT_METHOD_BARRIER); + if (status != CUOPT_SUCCESS) { + printf("Error setting barrier method: %d\n", status); + goto DONE; + } + + status = cuOptSolve(problem, settings, &solution); + if (status != CUOPT_SUCCESS) { + printf("Error solving problem: %d\n", status); + goto DONE; + } + + status = cuOptGetTerminationStatus(solution, termination_status_ptr); + if (status != CUOPT_SUCCESS) { + printf("Error getting termination status: %d\n", status); + goto DONE; + } + + status = cuOptGetObjectiveValue(solution, objective_ptr); + if (status != CUOPT_SUCCESS) { + printf("Error getting objective value: %d\n", status); + goto DONE; + } + + status = cuOptGetPrimalSolution(solution, solution_values); + if (status != CUOPT_SUCCESS) { + printf("Error getting primal solution: %d\n", status); + goto DONE; + } + +DONE: + cuOptDestroyProblem(&problem); + cuOptDestroySolverSettings(&settings); + cuOptDestroySolution(&solution); + + return status; +} + cuopt_int_t test_write_problem(const char* input_filename, const char* output_filename) { cuOptOptimizationProblem problem = NULL; diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp index 2a70c5a67d..1cf4bbaf9d 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp +++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp @@ -226,6 +226,21 @@ TEST(c_api, test_quadratic_ranged_problem) EXPECT_NEAR(objective, -32.0, 1e-3); } +TEST(c_api, test_quadratic_constraint_problem) +{ + cuopt_int_t termination_status; + cuopt_float_t objective; + cuopt_float_t solution_values[4]; + EXPECT_EQ(test_quadratic_constraint_problem(&termination_status, &objective, solution_values), + CUOPT_SUCCESS); + EXPECT_EQ(termination_status, 
CUOPT_TERMINATION_STATUS_OPTIMAL); + EXPECT_NEAR(objective, -13.548638904065102, 1e-4); + EXPECT_NEAR(solution_values[0], -3.874621860638774, 1e-4); + EXPECT_NEAR(solution_values[1], -2.129788233677883, 1e-4); + EXPECT_NEAR(solution_values[2], 2.33480343377204, 1e-4); + EXPECT_NEAR(solution_values[3], 5.0, 1e-4); +} + TEST(c_api, test_write_problem) { const std::string& rapidsDatasetRootDir = cuopt::test::get_rapids_dataset_root_dir(); diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.h b/cpp/tests/linear_programming/c_api_tests/c_api_tests.h index 7720cb2c0d..4a7ce8dcaf 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.h +++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.h @@ -39,6 +39,9 @@ cuopt_int_t test_quadratic_problem(cuopt_int_t* termination_status_ptr, cuopt_float_t* objective_ptr); cuopt_int_t test_quadratic_ranged_problem(cuopt_int_t* termination_status_ptr, cuopt_float_t* objective_ptr); +cuopt_int_t test_quadratic_constraint_problem(cuopt_int_t* termination_status_ptr, + cuopt_float_t* objective_ptr, + cuopt_float_t* solution_values); cuopt_int_t test_write_problem(const char* input_filename, const char* output_filename); cuopt_int_t test_maximize_problem_dual_variables(cuopt_int_t method, cuopt_int_t* termination_status_ptr, diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp index 3b93f76f72..af1368865d 100644 --- a/cpp/tests/linear_programming/parser_test.cpp +++ b/cpp/tests/linear_programming/parser_test.cpp @@ -2155,11 +2155,12 @@ End EXPECT_EQ(qc.constraint_row_type, static_cast(LesserThanOrEqual)); EXPECT_NEAR(qc.rhs_value, 10.0, tolerance); EXPECT_TRUE(qc.linear_indices.empty()); - // Q = diag(1, 1). CSR: offsets=[0, 1, 2], indices=[0, 1], values=[1, 1]. 
- EXPECT_EQ(qc.quadratic_offsets, (std::vector{0, 1, 2})); - ASSERT_EQ(qc.quadratic_values.size(), 2u); - EXPECT_NEAR(qc.quadratic_values[0], 1.0, tolerance); - EXPECT_NEAR(qc.quadratic_values[1], 1.0, tolerance); + // Q = diag(1, 1) stored as COO triplets (row, col, value). + EXPECT_EQ(qc.rows, (std::vector{0, 1})); + EXPECT_EQ(qc.cols, (std::vector{0, 1})); + ASSERT_EQ(qc.vals.size(), 2u); + EXPECT_NEAR(qc.vals[0], 1.0, tolerance); + EXPECT_NEAR(qc.vals[1], 1.0, tolerance); } TEST(lp_parser, qc_cross_term_splits_symmetrically) @@ -2175,13 +2176,14 @@ End )LP"); ASSERT_EQ(m.get_quadratic_constraints().size(), 1u); const auto& qc = nth_qc(m, 0); - // Q has 4 entries (all of [[1,2],[2,1]]). - EXPECT_EQ(qc.quadratic_offsets, (std::vector{0, 2, 4})); - ASSERT_EQ(qc.quadratic_values.size(), 4u); - EXPECT_NEAR(qc.quadratic_values[0], 1.0, tolerance); // (0, 0) - EXPECT_NEAR(qc.quadratic_values[1], 2.0, tolerance); // (0, 1) - EXPECT_NEAR(qc.quadratic_values[2], 2.0, tolerance); // (1, 0) - EXPECT_NEAR(qc.quadratic_values[3], 1.0, tolerance); // (1, 1) + // Q has 4 entries (all of [[1,2],[2,1]]) stored as COO triplets. 
+ EXPECT_EQ(qc.rows, (std::vector{0, 0, 1, 1})); + EXPECT_EQ(qc.cols, (std::vector{0, 1, 0, 1})); + ASSERT_EQ(qc.vals.size(), 4u); + EXPECT_NEAR(qc.vals[0], 1.0, tolerance); // (0, 0) + EXPECT_NEAR(qc.vals[1], 2.0, tolerance); // (0, 1) + EXPECT_NEAR(qc.vals[2], 2.0, tolerance); // (1, 0) + EXPECT_NEAR(qc.vals[3], 1.0, tolerance); // (1, 1) } TEST(lp_parser, qc_linear_and_quadratic_mixed) @@ -2248,8 +2250,8 @@ End ASSERT_EQ(m.get_quadratic_constraints().size(), 1u); const auto& qc = nth_qc(m, 0); EXPECT_NEAR(qc.rhs_value, 5.0, tolerance); - ASSERT_EQ(qc.quadratic_values.size(), 1u); - EXPECT_NEAR(qc.quadratic_values[0], -1.0, tolerance); + ASSERT_EQ(qc.vals.size(), 1u); + EXPECT_NEAR(qc.vals[0], -1.0, tolerance); ASSERT_EQ(qc.linear_indices.size(), 1u); EXPECT_NEAR(qc.linear_values[0], -2.0, tolerance); } @@ -2729,17 +2731,17 @@ TEST(qps_parser, qcmatrix_append_api) // Validate default-constructed struct shape. model_t::quadratic_constraint_t default_qcm; EXPECT_EQ(0, default_qcm.constraint_row_index); - EXPECT_TRUE(default_qcm.quadratic_values.empty()); - EXPECT_TRUE(default_qcm.quadratic_indices.empty()); - EXPECT_TRUE(default_qcm.quadratic_offsets.empty()); + EXPECT_TRUE(default_qcm.vals.empty()); + EXPECT_TRUE(default_qcm.rows.empty()); + EXPECT_TRUE(default_qcm.cols.empty()); EXPECT_TRUE(default_qcm.linear_values.empty()); EXPECT_TRUE(default_qcm.linear_indices.empty()); EXPECT_EQ(0.0, default_qcm.rhs_value); // QC0: [[10, 2], [2, 2]] const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; - const std::vector qc0_indices = {0, 1, 0, 1}; - const std::vector qc0_offsets = {0, 2, 4}; + const std::vector qc0_row_indices = {0, 0, 1, 1}; + const std::vector qc0_col_indices = {0, 1, 0, 1}; const std::vector qc0_linear_values = {1.0, 1.0}; const std::vector qc0_linear_indices = {0, 1}; model.append_quadratic_constraint(0, @@ -2749,13 +2751,13 @@ TEST(qps_parser, qcmatrix_append_api) qc0_linear_indices, 5.0, qc0_values, - qc0_indices, - qc0_offsets); + qc0_row_indices, + 
qc0_col_indices); // QC1: [[4, 1], [1, 6]] const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; - const std::vector qc1_indices = {0, 1, 0, 1}; - const std::vector qc1_offsets = {0, 2, 4}; + const std::vector qc1_row_indices = {0, 0, 1, 1}; + const std::vector qc1_col_indices = {0, 1, 0, 1}; const std::vector qc1_linear_values = {3.0, 1.0}; const std::vector qc1_linear_indices = {0, 1}; model.append_quadratic_constraint(1, @@ -2765,8 +2767,8 @@ TEST(qps_parser, qcmatrix_append_api) qc1_linear_indices, 10.0, qc1_values, - qc1_indices, - qc1_offsets); + qc1_row_indices, + qc1_col_indices); ASSERT_TRUE(model.has_quadratic_constraints()); const auto& qcs = model.get_quadratic_constraints(); @@ -2778,9 +2780,9 @@ TEST(qps_parser, qcmatrix_append_api) EXPECT_EQ(qc0_linear_values, qcs[0].linear_values); EXPECT_EQ(qc0_linear_indices, qcs[0].linear_indices); EXPECT_EQ(5.0, qcs[0].rhs_value); - EXPECT_EQ(qc0_values, qcs[0].quadratic_values); - EXPECT_EQ(qc0_indices, qcs[0].quadratic_indices); - EXPECT_EQ(qc0_offsets, qcs[0].quadratic_offsets); + EXPECT_EQ(qc0_values, qcs[0].vals); + EXPECT_EQ(qc0_row_indices, qcs[0].rows); + EXPECT_EQ(qc0_col_indices, qcs[0].cols); EXPECT_EQ(1, qcs[1].constraint_row_index); EXPECT_EQ("QC1", qcs[1].constraint_row_name); @@ -2788,9 +2790,9 @@ TEST(qps_parser, qcmatrix_append_api) EXPECT_EQ(qc1_linear_values, qcs[1].linear_values); EXPECT_EQ(qc1_linear_indices, qcs[1].linear_indices); EXPECT_EQ(10.0, qcs[1].rhs_value); - EXPECT_EQ(qc1_values, qcs[1].quadratic_values); - EXPECT_EQ(qc1_indices, qcs[1].quadratic_indices); - EXPECT_EQ(qc1_offsets, qcs[1].quadratic_offsets); + EXPECT_EQ(qc1_values, qcs[1].vals); + EXPECT_EQ(qc1_row_indices, qcs[1].rows); + EXPECT_EQ(qc1_col_indices, qcs[1].cols); } // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic. 
@@ -2834,7 +2836,7 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) EXPECT_EQ(0, qcs[0].linear_indices[0]); EXPECT_EQ(1, qcs[0].linear_indices[1]); EXPECT_DOUBLE_EQ(5.0, qcs[0].rhs_value); - EXPECT_FALSE(qcs[0].quadratic_values.empty()); + EXPECT_FALSE(qcs[0].vals.empty()); // QC1: 3*x1 + x2 + xᵀQ₁x ≤ 10 EXPECT_EQ(2, qcs[1].constraint_row_index); @@ -2871,7 +2873,7 @@ TEST(qps_parser, qcqp_p0033_mps_sections) EXPECT_EQ(static_cast(c159_it - vnames.begin()), qcs[0].linear_indices[0]); EXPECT_DOUBLE_EQ(1.0, qcs[0].rhs_value); - EXPECT_FALSE(qcs[0].quadratic_values.empty()); + EXPECT_FALSE(qcs[0].vals.empty()); } TEST(mps_roundtrip, qcqp_p0033_qc1) diff --git a/cpp/tests/qp/unit_tests/no_constraints.cu b/cpp/tests/qp/unit_tests/no_constraints.cu index 75190b37fc..49159679fc 100644 --- a/cpp/tests/qp/unit_tests/no_constraints.cu +++ b/cpp/tests/qp/unit_tests/no_constraints.cu @@ -37,7 +37,8 @@ TEST(no_constraints_test, simple_test) int A_offsets_host[] = {0}; op_problem.set_csr_constraint_matrix(A_values_host, 0, A_indices_host, 0, A_offsets_host, 1); - double lb_host[] = {0.0, 0.0}; + double lb_host[] = {-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; double ub_host[] = {std::numeric_limits::infinity(), std::numeric_limits::infinity()}; op_problem.set_variable_lower_bounds(lb_host, 2); diff --git a/cpp/tests/socp/CMakeLists.txt b/cpp/tests/socp/CMakeLists.txt new file mode 100644 index 0000000000..d53049b2d2 --- /dev/null +++ b/cpp/tests/socp/CMakeLists.txt @@ -0,0 +1,9 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + +ConfigureTest(SOCP_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/second_order_cone_kernels.cu + ${CMAKE_CURRENT_SOURCE_DIR}/solve_barrier_socp.cu + LABELS numopt) diff --git a/cpp/tests/socp/second_order_cone_kernels.cu b/cpp/tests/socp/second_order_cone_kernels.cu new file mode 100644 index 0000000000..49126ee335 --- /dev/null +++ b/cpp/tests/socp/second_order_cone_kernels.cu @@ -0,0 +1,584 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex::test { + +TEST(second_order_cone_kernels, topology_and_scratch_layout) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 2, 5}; + rmm::device_uvector x(10, stream); + rmm::device_uvector z(10, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + EXPECT_EQ(cones.n_cones, std::size_t{3}); + EXPECT_EQ(cones.n_cone_entries, std::size_t{10}); + EXPECT_EQ(cones.x.data(), x.data()); + EXPECT_EQ(cones.z.data(), z.data()); + + EXPECT_EQ(cuopt::host_copy(cones.cone_offsets, stream), (std::vector{0, 3, 5, 10})); + EXPECT_EQ(cuopt::host_copy(cones.cone_dimensions, stream), cone_dimensions); + EXPECT_EQ(cuopt::host_copy(cones.element_cone_ids, stream), + (std::vector{0, 0, 0, 1, 1, 2, 2, 2, 2, 2})); + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.small_cone_ids, stream), + (std::vector{0, 1, 2})); + EXPECT_TRUE(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream).empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_ids.empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_offsets.empty()); + EXPECT_TRUE(cones.segmented_sum.large_cone_dimensions.empty()); + + EXPECT_EQ(cones.eta.size(), 3); + EXPECT_EQ(cones.w.size(), 10); 
+ + auto& scratch = cones.scratch; + EXPECT_EQ(scratch.n_cones, cones.n_cones); + EXPECT_EQ(scratch.n_cone_entries, cones.n_cone_entries); + EXPECT_EQ(scratch.slots.size(), 3 * cone_dimensions.size()); + EXPECT_EQ(scratch.step_alpha_primal.size(), cone_dimensions.size()); + EXPECT_EQ(scratch.step_alpha_dual.size(), cone_dimensions.size()); + EXPECT_EQ(scratch.temp_cone.size(), x.size()); + + EXPECT_EQ(scratch.get_slot<0>().size(), cone_dimensions.size()); + EXPECT_EQ(scratch.get_slot<1>().data(), scratch.get_slot<0>().data() + cones.n_cones); + EXPECT_EQ(scratch.get_slot<2>().data(), scratch.get_slot<1>().data() + cones.n_cones); +} + +TEST(second_order_cone_kernels, segmented_sum_uses_all_cone_size_buckets) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{65, 3, 66, 32769}; + rmm::device_uvector x(32903, stream); + rmm::device_uvector z(32903, stream); + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.small_cone_ids, stream), (std::vector{1})); + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream), + (std::vector{0, 2})); + EXPECT_EQ(cones.segmented_sum.large_cone_ids, (std::vector{3})); + EXPECT_EQ(cones.segmented_sum.large_cone_offsets, (std::vector{134})); + EXPECT_EQ(cones.segmented_sum.large_cone_dimensions, (std::vector{32769})); + + std::vector values_host(cones.n_cone_entries, 1.0); + rmm::device_uvector values(values_host.size(), stream); + rmm::device_uvector sums(cone_dimensions.size(), stream); + raft::copy(values.data(), values_host.data(), values_host.size(), stream); + + EXPECT_GT(cones.segmented_sum.cub_workspace_bytes, 0); + const auto workspace_size = cones.segmented_sum.cub_workspace.size(); + EXPECT_GT(workspace_size, 0); + + cones.segmented_sum(values.data(), cuopt::make_span(sums), stream); + + EXPECT_EQ(cuopt::host_copy(sums, stream), (std::vector{65.0, 3.0, 66.0, 32769.0})); + 
EXPECT_EQ(cones.segmented_sum.cub_workspace.size(), workspace_size); +} + +TEST(second_order_cone_kernels, nt_scaling_matches_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 100.0 + static_cast(cone); + z_host[offset] = 80.0 + static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + x_host[offset + local_idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[offset + local_idx] = 0.0015 * static_cast((local_idx % 7) + 1); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + const auto workspace_size = cones.segmented_sum.cub_workspace.size(); + EXPECT_GT(workspace_size, 0); + + launch_nt_scaling(cones, stream); + EXPECT_EQ(cones.segmented_sum.cub_workspace.size(), workspace_size); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + + std::vector expected_eta(cone_dimensions.size()); + std::vector expected_w(n_cone_entries); + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double x_tail_sq = 0.0; + double z_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_tail_sq += x_host[idx] * x_host[idx]; + z_tail_sq += z_host[idx] * z_host[idx]; + } + + const auto x_tail_norm = std::sqrt(x_tail_sq); + const auto z_tail_norm = std::sqrt(z_tail_sq); + const auto x_det = 
(x_host[offset] - x_tail_norm) * (x_host[offset] + x_tail_norm); + const auto z_det = (z_host[offset] - z_tail_norm) * (z_host[offset] + z_tail_norm); + ASSERT_GT(x_det, 0.0) << "cone " << cone; + ASSERT_GT(z_det, 0.0) << "cone " << cone; + + const auto x_scale = std::sqrt(x_det); + const auto z_scale = std::sqrt(z_det); + + expected_eta[cone] = std::sqrt(z_scale / x_scale); + + double normalized_xz_dot = 0.0; + for (int local_idx = 0; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_xz_dot += x_host[idx] * z_host[idx] / (x_scale * z_scale); + } + const auto w_det = 2.0 + 2.0 * normalized_xz_dot; + ASSERT_GT(w_det, 0.0) << "cone " << cone; + const auto w_scale = std::sqrt(w_det); + + expected_w[offset] = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + expected_w[idx] = (z_host[idx] / z_scale - x_host[idx] / x_scale) / w_scale; + } + + double normalized_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_tail_sq += expected_w[idx] * expected_w[idx]; + } + expected_w[offset] = std::sqrt(1.0 + normalized_tail_sq); + + offset += static_cast(dim); + } + + for (std::size_t i = 0; i < expected_eta.size(); ++i) { + EXPECT_NEAR(eta_host[i], expected_eta[i], 1e-10) << "cone " << i; + } + + for (std::size_t i = 0; i < expected_w.size(); ++i) { + EXPECT_NEAR(w_host[i], expected_w[i], 1e-10) << "entry " << i; + } + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + tail_sq += w_host[idx] * w_host[idx]; + } + + EXPECT_NEAR(w_host[offset] * w_host[offset] - tail_sq, 1.0, 1e-10) << "cone " << cone; + offset += static_cast(dim); + } +} + +TEST(second_order_cone_kernels, cone_step_length_keeps_iterate_in_cone) +{ + auto 
stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector dx_host(n_cone_entries); + std::vector dz_host(n_cone_entries); + + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + x_host[offset] = 12.0 + static_cast(cone); + z_host[offset] = 14.0 + static_cast(cone); + dx_host[offset] = (cone == 0) ? -30.0 : 0.2; + dz_host[offset] = (cone == 1) ? -25.0 : 0.15; + + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + dx_host[idx] = 0.02 * static_cast((local_idx % 5) - 2); + dz_host[idx] = -0.015 * static_cast((local_idx % 7) - 3); + } + + offset += static_cast(dim); + } + + constexpr double alpha_max = 0.99; + constexpr double cone_tol = 1e-8; + + const auto cone_block_offset = [&](std::size_t cone_idx) { + std::size_t off = 0; + for (std::size_t c = 0; c < cone_idx; ++c) { + off += static_cast(cone_dimensions[c]); + } + return off; + }; + + const auto expect_cone_feasible_after_step = [&](std::vector const& u, + std::vector const& du, + double alpha, + std::size_t cone_idx, + const char* label) { + const std::size_t off = cone_block_offset(cone_idx); + const auto dim = cone_dimensions[cone_idx]; + const double u0 = u[off] + alpha * du[off]; + + double tail_sq = 0.0; + for (int j = 1; j < dim; ++j) { + const double tail = u[off + j] + alpha * du[off + j]; + tail_sq += tail * tail; + } + + EXPECT_GE(u0, -cone_tol) << label << " cone " << cone_idx; + EXPECT_GE(u0 * u0 + cone_tol, tail_sq) << label << " cone " << cone_idx; + }; + + auto x = cuopt::device_copy(x_host, stream); + auto z = 
cuopt::device_copy(z_host, stream); + auto dx = cuopt::device_copy(dx_host, stream); + auto dz = cuopt::device_copy(dz_host, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + const auto [step_primal, step_dual] = + compute_cone_step_length(cones, + raft::device_span(dx.data(), dx.size()), + raft::device_span(dz.data(), dz.size()), + alpha_max, + stream); + + const auto primal_per_cone = cuopt::host_copy(cones.scratch.step_alpha_primal, stream); + const auto dual_per_cone = cuopt::host_copy(cones.scratch.step_alpha_dual, stream); + EXPECT_NEAR( + step_primal, *std::min_element(primal_per_cone.begin(), primal_per_cone.end()), 1e-12); + EXPECT_NEAR(step_dual, *std::min_element(dual_per_cone.begin(), dual_per_cone.end()), 1e-12); + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + EXPECT_GT(primal_per_cone[cone], 0.0) << "primal cone " << cone; + EXPECT_GT(dual_per_cone[cone], 0.0) << "dual cone " << cone; + expect_cone_feasible_after_step(x_host, dx_host, primal_per_cone[cone], cone, "primal"); + expect_cone_feasible_after_step(z_host, dz_host, dual_per_cone[cone], cone, "dual"); + } +} + +TEST(second_order_cone_kernels, scaling_operators_match_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector v_host(n_cone_entries); + std::vector cone_target_host(n_cone_entries); + std::vector accum_initial_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 100.0 + static_cast(cone); + z_host[offset] = 80.0 + static_cast(cone); + v_host[offset] = 0.75 + 0.1 * static_cast(cone); + cone_target_host[offset] = 0.4 + 0.03 * 
static_cast(cone); + accum_initial_host[offset] = -0.2 + 0.02 * static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + v_host[idx] = 0.002 * static_cast((local_idx % 11) - 5); + cone_target_host[idx] = 0.003 * static_cast((local_idx % 13) - 6); + accum_initial_host[idx] = 0.004 * static_cast((local_idx % 17) - 8); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto v = cuopt::device_copy(v_host, stream); + auto cone_target = cuopt::device_copy(cone_target_host, stream); + auto accum = cuopt::device_copy(accum_initial_host, stream); + rmm::device_uvector w_out(n_cone_entries, stream); + rmm::device_uvector w_inv_out(n_cone_entries, stream); + rmm::device_uvector h_out(n_cone_entries, stream); + rmm::device_uvector w_tmp(n_cone_entries, stream); + // apply_w then apply_w on same v: should match apply_hessian (H = W^2 for symmetric NT W). 
+ rmm::device_uvector w_squared_v(n_cone_entries, stream); + rmm::device_uvector recovered_dz(n_cone_entries, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + launch_nt_scaling(cones, stream); + + auto v_span = raft::device_span(v.data(), v.size()); + apply_w(v_span, cuopt::make_span(w_out), cones, stream); + apply_w_inv(v_span, cuopt::make_span(w_inv_out), cones, stream); + apply_hessian(v_span, cuopt::make_span(h_out), cones, stream); + recover_cone_dz_from_target( + v_span, + cones, + raft::device_span(cone_target.data(), cone_target.size()), + cuopt::make_span(recovered_dz), + stream); + accumulate_cone_hessian_matvec(v_span, cones, cuopt::make_span(accum), stream); + apply_w(v_span, cuopt::make_span(w_tmp), cones, stream); + apply_w(raft::device_span(w_tmp.data(), w_tmp.size()), + cuopt::make_span(w_squared_v), + cones, + stream); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto w_out_host = cuopt::host_copy(w_out, stream); + auto w_inv_out_host = cuopt::host_copy(w_inv_out, stream); + auto h_out_host = cuopt::host_copy(h_out, stream); + auto w_squared_v_host = cuopt::host_copy(w_squared_v, stream); + auto recovered_dz_host = cuopt::host_copy(recovered_dz, stream); + auto accum_host = cuopt::host_copy(accum, stream); + + std::vector expected_w(n_cone_entries); + std::vector expected_w_inv(n_cone_entries); + std::vector expected_h(n_cone_entries); + std::vector expected_h_unscaled(n_cone_entries); + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[offset]; + const auto v0 = v_host[offset]; + const auto eta = eta_host[cone]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + tail_dot += w_host[idx] * v_host[idx]; + } + + expected_w[offset] = eta * (w0 * v0 + tail_dot); + 
expected_w_inv[offset] = (w0 * v0 - tail_dot) / eta; + + const auto rho = w0 * v0 + tail_dot; + expected_h_unscaled[offset] = (eta * eta) * (2.0 * w0 * rho - v0); + expected_h[offset] = expected_h_unscaled[offset]; + + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + + expected_w[idx] = eta * (v_host[idx] + (v0 + tail_dot / (1.0 + w0)) * w_host[idx]); + expected_w_inv[idx] = (v_host[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta; + expected_h_unscaled[idx] = (eta * eta) * (2.0 * w_host[idx] * rho + v_host[idx]); + expected_h[idx] = expected_h_unscaled[idx]; + } + + offset += static_cast(dim); + } + + for (std::size_t i = 0; i < n_cone_entries; ++i) { + EXPECT_NEAR(w_out_host[i], expected_w[i], 1e-9) << "W entry " << i; + EXPECT_NEAR(w_inv_out_host[i], expected_w_inv[i], 1e-9) << "W inverse entry " << i; + EXPECT_NEAR(h_out_host[i], expected_h[i], 1e-9) << "H entry " << i; + EXPECT_NEAR(h_out_host[i], w_squared_v_host[i], 1e-9) << "W^2 v vs H v entry " << i; + EXPECT_NEAR(recovered_dz_host[i], cone_target_host[i] - expected_h_unscaled[i], 1e-9) + << "recovered dz entry " << i; + EXPECT_NEAR(accum_host[i], accum_initial_host[i] + expected_h_unscaled[i], 1e-9) + << "accumulated H entry " << i; + } +} + +TEST(second_order_cone_kernels, combined_cone_rhs_matches_host_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 65, 32769}; + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector dx_aff_host(n_cone_entries); + std::vector dz_aff_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 120.0 + static_cast(cone); + z_host[offset] = 90.0 + static_cast(cone); + dx_aff_host[offset] = 0.25 + 0.05 * 
static_cast(cone); + dz_aff_host[offset] = -0.3 + 0.04 * static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + dx_aff_host[idx] = 0.002 * static_cast((local_idx % 11) - 5); + dz_aff_host[idx] = 0.001 * static_cast((local_idx % 13) - 6); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto dx_aff = cuopt::device_copy(dx_aff_host, stream); + auto dz_aff = cuopt::device_copy(dz_aff_host, stream); + rmm::device_uvector out(n_cone_entries, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + launch_nt_scaling(cones, stream); + + constexpr double sigma_mu = 0.37; + compute_combined_cone_rhs_term(raft::device_span(dx_aff.data(), dx_aff.size()), + raft::device_span(dz_aff.data(), dz_aff.size()), + cones, + sigma_mu, + cuopt::make_span(out), + stream); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto out_host = cuopt::host_copy(out, stream); + + auto apply_w_ref = [&](std::vector const& v) { + std::vector result(n_cone_entries); + std::size_t off = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[off]; + const auto v0 = v[off]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + tail_dot += w_host[idx] * v[idx]; + } + + result[off] = eta_host[cone] * (w0 * v0 + tail_dot); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + result[idx] = eta_host[cone] * (v[idx] + (v0 + tail_dot / (1.0 + w0)) * w_host[idx]); + } + + off += static_cast(dim); + } + return result; + }; + + auto apply_w_inv_ref = 
[&](std::vector const& v) { + std::vector result(n_cone_entries); + std::size_t off = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[off]; + const auto v0 = v[off]; + + double tail_dot = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + tail_dot += w_host[idx] * v[idx]; + } + + result[off] = (w0 * v0 - tail_dot) / eta_host[cone]; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = off + local_idx; + result[idx] = (v[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta_host[cone]; + } + + off += static_cast(dim); + } + return result; + }; + + // Same order as compute_combined_cone_rhs_term: apply_w(dx_aff), apply_w_inv(dz_aff). + auto scaled_dx = apply_w_ref(dx_aff_host); + auto scaled_dz = apply_w_inv_ref(dz_aff_host); + auto nt_point = apply_w_inv_ref(z_host); + + std::vector shift(n_cone_entries); + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double head_dot = 0.0; + for (int local_idx = 0; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + head_dot += scaled_dx[idx] * scaled_dz[idx]; + } + + shift[offset] = head_dot - sigma_mu; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + shift[idx] = scaled_dx[offset] * scaled_dz[idx] + scaled_dz[offset] * scaled_dx[idx]; + } + + offset += static_cast(dim); + } + + std::vector minus_p(n_cone_entries); + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + const auto lambda0 = nt_point[offset]; + + double lambda_tail_dot = 0.0; + double lambda_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + lambda_tail_dot += nt_point[idx] * shift[idx]; + lambda_tail_sq += 
nt_point[idx] * nt_point[idx]; + } + + const auto lambda_tail_norm = std::sqrt(lambda_tail_sq); + const auto det_lambda = (lambda0 - lambda_tail_norm) * (lambda0 + lambda_tail_norm); + ASSERT_GT(lambda0, 0.0) << "cone " << cone; + ASSERT_GT(det_lambda, 0.0) << "cone " << cone; + + const auto p_head = (lambda0 * shift[offset] - lambda_tail_dot) / det_lambda; + minus_p[offset] = -p_head; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + minus_p[idx] = (p_head * nt_point[idx] - shift[idx]) / lambda0; + } + + offset += static_cast(dim); + } + + auto expected = apply_w_ref(minus_p); + for (std::size_t i = 0; i < n_cone_entries; ++i) { + EXPECT_NEAR(out_host[i], expected[i], 1e-8) << "entry " << i; + } +} + +} // namespace cuopt::linear_programming::dual_simplex::test diff --git a/cpp/tests/socp/solve_barrier_socp.cu b/cpp/tests/socp/solve_barrier_socp.cu new file mode 100644 index 0000000000..f960c3dc3a --- /dev/null +++ b/cpp/tests/socp/solve_barrier_socp.cu @@ -0,0 +1,827 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cuopt::linear_programming::dual_simplex::test { + +// This serves as both a warm up but also a mandatory initial call to setup cuSparse and cuBLAS +static void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +TEST(barrier, cone_metadata_reindexed_when_slack_is_inserted_before_cones) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 5; + constexpr int nz = 5; + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective.assign(n, 0.0); + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start.resize(n + 1); + for (int j = 0; j < n; ++j) { + user_problem.A.col_start[j] = j; + user_problem.A.i[j] = 0; + user_problem.A.x[j] = 1.0; + } + user_problem.A.col_start[n] = nz; + user_problem.rhs = {1.0}; + user_problem.row_sense = {'L'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.second_order_cone_dims = {2, 2}; + user_problem.cone_var_start = 1; + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t 
original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + ASSERT_EQ(new_slacks.size(), 1); + EXPECT_EQ(new_slacks[0], 1); + EXPECT_EQ(original_lp.num_cols, 6); + EXPECT_EQ(original_lp.second_order_cone_dims, user_problem.second_order_cone_dims); + EXPECT_EQ(original_lp.cone_var_start, 2); + + lp_problem_t barrier_lp(user_problem.handle_ptr, + original_lp.num_rows, + original_lp.num_cols, + original_lp.A.col_start[original_lp.num_cols]); + std::vector column_scales; + std::vector row_scales; + scaling(original_lp, settings, barrier_lp, column_scales, row_scales); + + EXPECT_EQ(barrier_lp.second_order_cone_dims, user_problem.second_order_cone_dims); + EXPECT_EQ(barrier_lp.cone_var_start, 2); +} + +TEST(barrier, presolve_reindexes_cone_start_after_empty_column_removal) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 4; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 0, 1, 2, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = 0.5; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = 
false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + presolve_info_t presolve_info; + lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); + ASSERT_EQ(presolve(original_lp, settings, presolved_lp, presolve_info), 0); + + EXPECT_EQ(presolved_lp.num_cols, 3); + EXPECT_EQ(presolved_lp.second_order_cone_dims, std::vector({3})); + EXPECT_EQ(presolved_lp.cone_var_start, 0); + + lp_problem_t barrier_lp(user_problem.handle_ptr, + presolved_lp.num_rows, + presolved_lp.num_cols, + presolved_lp.A.col_start[presolved_lp.num_cols]); + std::vector column_scales; + std::vector row_scales; + ASSERT_EQ(scaling(presolved_lp, settings, barrier_lp, column_scales, row_scales), 0); + EXPECT_EQ(barrier_lp.cone_var_start, 0); +} + +TEST(barrier, presolve_keeps_direct_free_variables_before_cones) +{ + // Layout: [x0, x1 | cone x2, x3, x4] with x0, x1 free and a 3-dimensional SOC block. + // SOCP barrier presolve keeps direct free variables (no x = v - w split); cone_var_start + // and column count stay unchanged. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 5; + constexpr int nz = 5; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 1, 2, 3, 4, 5}; + for (int j = 0; j < n; ++j) { + user_problem.A.i[j] = 0; + user_problem.A.x[j] = 1.0; + } + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + user_problem.lower = {-inf, -inf, 0.0, 0.0, 0.0}; + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = false; + + std::vector new_slacks; + dualize_info_t dualize_info; + lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); + convert_user_problem(user_problem, settings, original_lp, new_slacks, dualize_info); + + presolve_info_t presolve_info; + lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); + ASSERT_EQ(presolve(original_lp, settings, presolved_lp, presolve_info), 0); + + EXPECT_EQ(presolved_lp.num_cols, 5); + EXPECT_EQ(presolved_lp.cone_var_start, 2); + EXPECT_EQ(presolved_lp.second_order_cone_dims, std::vector({3})); + EXPECT_TRUE(presolve_info.free_variable_pairs.empty()); + ASSERT_EQ(presolve_info.direct_free_variables.size(), 2); + EXPECT_EQ(presolve_info.direct_free_variables[0], 0); + EXPECT_EQ(presolve_info.direct_free_variables[1], 1); + EXPECT_EQ(presolved_lp.lower[0], -inf); + EXPECT_EQ(presolved_lp.lower[1], -inf); + EXPECT_EQ(presolved_lp.upper[0], inf); + 
EXPECT_EQ(presolved_lp.upper[1], inf); +} + +TEST(barrier, rejects_middle_cone_input_before_barrier) +{ + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 3; + constexpr int n = 5; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0, 1.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 1, 1, 2, 2, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 2; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {2.0, 1.0, 3.0}; + user_problem.row_sense = {'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + user_problem.num_range_rows = 0; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.dualize = 0; + lp_solution_t solution(m, n); + + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + EXPECT_EQ(status, lp_status_t::NUMERICAL_ISSUES); +} + +TEST(barrier, socp_min_x0_subject_to_norm_constraint) +{ + // minimize x_0 + // subject to x_1 = 1 + // (x_0, x_1, x_2) in Q^3 + // + // Optimal: x* = (1, 1, 0), obj* = 1 + + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 3; + constexpr int nz = 1; + + user_problem.num_rows = m; + user_problem.num_cols = n; + + user_problem.objective = {1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + 
user_problem.A.col_start = {0, 0, 1, 1}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + + user_problem.lower = {0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "socp_norm_cone"; + + user_problem.cone_var_start = 0; + user_problem.second_order_cone_dims = {3}; + + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[2]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize l + // subject to l - t = 0 + // u = 1 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {1.0, 0.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 2, 3, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {0.0, 1.0}; + user_problem.row_sense = {'E', 'E'}; + + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_block"; + + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_soc_tail_coupling) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize t + // subject to l - u = 0 + // l + u = 2 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 2, 2, 4, 4}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = -1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + + user_problem.rhs = {0.0, 2.0}; + user_problem.row_sense = {'E', 'E'}; + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_tail_coupling"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_soc_tail_coupling_with_inequality) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize t + // subject to l - u = 0 + // l + u >= 2 + // (t, u, v) in Q^3 + // + // Optimal: l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 4; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 2, 2, 4, 4}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 1; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 0; + user_problem.A.x[2] = -1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + + user_problem.rhs = {0.0, 2.0}; + user_problem.row_sense = {'E', 'G'}; + user_problem.lower = {0.0, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_soc_tail_coupling_with_inequality"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_two_soc_blocks) +{ + // Variables ordered as [l1, l2 | t1, u1, v1 | t2, u2, v2], + // where (t1, u1, v1), (t2, u2, v2) \in Q^3. 
+ // + // minimize t1 + t2 + // subject to l1 - u1 = 0 + // l2 - u2 = 0 + // l1 + l2 = 3 + // l1 - l2 = 1 + // + // Optimal: l1* = 2, l2* = 1, t1* = 2, u1* = 2, v1* = 0, + // t2* = 1, u2* = 1, v2* = 0, obj* = 3. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 8; + constexpr int nz = 8; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l1, l2, t1, u1, v1, t2, u2, v2 + user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 3; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + user_problem.A.i[4] = 2; + user_problem.A.x[4] = 1.0; + user_problem.A.i[5] = 3; + user_problem.A.x[5] = -1.0; + user_problem.A.i[6] = 0; + user_problem.A.x[6] = -1.0; + user_problem.A.i[7] = 1; + user_problem.A.x[7] = -1.0; + + user_problem.rhs = {0.0, 0.0, 3.0, 1.0}; + user_problem.row_sense = {'E', 'E', 'E', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_linear_and_two_soc_blocks"; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3, 3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 3.0, 1e-4); + 
EXPECT_NEAR(solution.x[0], 2.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 2.0, 1e-4); + EXPECT_NEAR(solution.x[3], 2.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4); + EXPECT_NEAR(solution.x[5], 1.0, 1e-4); + EXPECT_NEAR(solution.x[6], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[7]), 0.0, 1e-4); +} + +TEST(barrier, mixed_linear_and_two_soc_blocks_with_inequality) +{ + // Variables ordered as [l1, l2 | t1, u1, v1 | t2, u2, v2], + // where (t1, u1, v1), (t2, u2, v2) \in Q^3. + // + // minimize t1 + t2 + // subject to l1 - u1 = 0 + // l2 - u2 = 0 + // l1 + l2 >= 3 + // l1 - l2 = 1 + // + // Optimal: l1* = 2, l2* = 1, t1* = 2, u1* = 2, v1* = 0, + // t2* = 1, u2* = 1, v2* = 0, obj* = 3. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 8; + constexpr int nz = 8; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l1, l2, t1, u1, v1, t2, u2, v2 + user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 3; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + user_problem.A.i[4] = 2; + user_problem.A.x[4] = 1.0; + user_problem.A.i[5] = 3; + user_problem.A.x[5] = -1.0; + user_problem.A.i[6] = 0; + user_problem.A.x[6] = -1.0; + user_problem.A.i[7] = 1; + user_problem.A.x[7] = -1.0; + + user_problem.rhs = {0.0, 0.0, 3.0, 1.0}; + user_problem.row_sense = {'E', 'E', 'G', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = 
"mixed_linear_and_two_soc_blocks_with_inequality"; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3, 3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 3.0, 1e-4); + EXPECT_NEAR(solution.x[0], 2.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 2.0, 1e-4); + EXPECT_NEAR(solution.x[3], 2.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4); + EXPECT_NEAR(solution.x[5], 1.0, 1e-4); + EXPECT_NEAR(solution.x[6], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[7]), 0.0, 1e-4); +} + +TEST(barrier, free_linear_prefix_is_uncrushed_correctly_with_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3 and l is free. + // + // minimize t + // subject to l - u = 0 + // u = 1 + // (t, u, v) in Q^3 + // + // Direct free variable l is kept through presolve; end-to-end solve returns + // l* = 1, t* = 1, u* = 1, v* = 0, obj* = 1. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 2; + constexpr int n = 4; + constexpr int nz = 3; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 1, 3, 3}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = -1.0; + user_problem.A.i[2] = 1; + user_problem.A.x[2] = 1.0; + + user_problem.rhs = {0.0, 1.0}; + user_problem.row_sense = {'E', 'E'}; + user_problem.lower = {-inf, 0.0, 0.0, 0.0}; + user_problem.upper = {inf, inf, inf, inf}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "free_linear_prefix_is_uncrushed_correctly_with_soc_block"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +TEST(barrier, qp_with_soc_block) +{ + // Variables ordered as [l | t, u, v], where (t, u, v) \in Q^3. + // + // minimize 0.5 l^2 + t + // subject to l + u = 2 + // (t, u, v) in Q^3 + // + // Since t >= |u| and u = 2 - l with l >= 0, the objective becomes + // 0.5 l^2 + |2 - l|, which is minimized at l* = 1, u* = 1, t* = 1, v* = 0. 
+ raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 4; + constexpr int nz = 2; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = {0.0, 1.0, 0.0, 0.0}; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l, t, u, v + user_problem.A.col_start = {0, 1, 1, 2, 2}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 0; + user_problem.A.x[1] = 1.0; + + user_problem.rhs = {2.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.Q_offsets = {0, 1, 1, 1, 1}; + user_problem.Q_indices = {0}; + user_problem.Q_values = {1.0}; + + user_problem.num_range_rows = 0; + user_problem.problem_name = "qp_with_soc_block"; + user_problem.cone_var_start = 1; + user_problem.second_order_cone_dims = {3}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.5, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 1.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); +} + +} // namespace cuopt::linear_programming::dual_simplex::test diff --git a/python/cuopt/cuopt/linear_programming/data_model/data_model.pxd b/python/cuopt/cuopt/linear_programming/data_model/data_model.pxd index 4a83f3a058..761fbb5dd6 100644 --- a/python/cuopt/cuopt/linear_programming/data_model/data_model.pxd +++ 
b/python/cuopt/cuopt/linear_programming/data_model/data_model.pxd @@ -7,11 +7,27 @@ # cython: embedsignature = True # cython: language_level = 3 +from libc.stddef cimport size_t from libcpp cimport bool from libcpp.string cimport string from libcpp.vector cimport vector +cdef extern from "cuopt/linear_programming/io/mps_data_model.hpp" namespace "cuopt::linear_programming::io" nogil: # noqa + + cdef cppclass mps_data_model_t[i_t, f_t]: + cppclass quadratic_constraint_t: + int constraint_row_index + string constraint_row_name + char constraint_row_type + vector[double] linear_values + vector[int] linear_indices + double rhs_value + vector[int] rows + vector[int] cols + vector[double] vals + + cdef extern from "cuopt/linear_programming/io/data_model_view.hpp" namespace "cuopt::linear_programming::io" nogil: # noqa cdef cppclass data_model_view_t[i_t, f_t]: @@ -54,6 +70,8 @@ cdef extern from "cuopt/linear_programming/io/data_model_view.hpp" namespace "cu void set_row_names(const vector[string] row_names) except + void set_problem_name(const string problem_name) except + void set_objective_name(const string objective_name) except + + void set_quadratic_constraints( + vector[mps_data_model_t[i_t, f_t].quadratic_constraint_t] constraints) except + cdef extern from "cuopt/linear_programming/io/writer.hpp" namespace "cuopt::linear_programming::io" nogil: # noqa diff --git a/python/cuopt/cuopt/linear_programming/data_model/data_model.py b/python/cuopt/cuopt/linear_programming/data_model/data_model.py index 648809eac1..7bcdfaea9b 100644 --- a/python/cuopt/cuopt/linear_programming/data_model/data_model.py +++ b/python/cuopt/cuopt/linear_programming/data_model/data_model.py @@ -4,6 +4,7 @@ import os import time + from . 
import data_model_wrapper from .utilities import catch_cuopt_exception @@ -288,6 +289,81 @@ def set_quadratic_objective_matrix(self, Q_values, Q_indices, Q_offsets): """ super().set_quadratic_objective_matrix(Q_values, Q_indices, Q_offsets) + def get_quadratic_constraints(self): + """ + Return quadratic constraints appended to this model. + + Each entry is a dict with keys including ``constraint_row_index``, + ``constraint_row_name``, ``constraint_row_type``, COO arrays, and ``rhs_value``. + """ + return self.quadratic_constraints + + @catch_cuopt_exception + def clear_quadratic_constraints(self): + """ + Remove all quadratic constraints from the model. + """ + super().clear_quadratic_constraints() + + @catch_cuopt_exception + def add_quadratic_constraint( + self, + constraint_row_name="", + linear_values=None, + linear_indices=None, + rhs_value=0.0, + vals=None, + rows=None, + cols=None, + sense="L", + ): + """ + Add a quadratic constraint. + + Each constraint has a linear part (optional) and a quadratic part in COO + format. Call multiple times to add several quadratic constraints. + + Parameters + ---------- + constraint_row_name : str, optional + Optional row name. + linear_values, linear_indices : array-like, optional + Sparse linear coefficients on the same variable index space. + rhs_value : float, optional + Right-hand side of the constraint. + vals, rows, cols : array-like + COO triplets for the quadratic matrix Q in + ``linear^T x + x^T Q x {sense} rhs_value``. + sense : str, optional + Constraint sense: ``'L'`` (default, ``<=``) or ``'G'`` (``>=``). + ``'G'`` constraints are converted to ``'L'`` internally. + Equality (``'E'``) is not supported. + + Notes + ----- + When any quadratic constraint is present, cuOpt selects the barrier + solver and converts quadratic constraints to second-order cones. 
+ """ + if hasattr(sense, "value"): + sense = sense.value + if sense == "E": + raise ValueError("Equality constraints are not supported.") + if sense not in ("L", "G"): + raise ValueError( + f"Invalid sense {sense!r}; use 'L' or 'G' like set_row_types " + "(equality 'E' is not supported)." + ) + super().add_quadratic_constraint( + constraint_row_name, + linear_values, + linear_indices, + rhs_value, + vals, + rows, + cols, + constraint_row_type=sense, + ) + @catch_cuopt_exception def set_variable_lower_bounds(self, variable_lower_bounds): """ diff --git a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pxd b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pxd index 6c798a0f6a..6c401b59f5 100644 --- a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pxd +++ b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# noqa # SPDX-License-Identifier: Apache-2.0 from .data_model cimport * @@ -12,3 +12,7 @@ from libcpp.memory cimport unique_ptr cdef class DataModel: cdef unique_ptr[data_model_view_t[int, double]] c_data_model_view + + cdef void _set_cpp_quadratic_constraints( + self, data_model_view_t[int, double]* c_data_model_view + ) diff --git a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx index 7722fb2437..ec8bdf3730 100644 --- a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx +++ b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx @@ -7,7 +7,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from .data_model cimport data_model_view_t, write_mps +from .data_model cimport data_model_view_t, mps_data_model_t, write_mps import warnings @@ -17,10 +17,14 @@ from cuopt.utilities import get_data_ptr from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move +from libcpp.vector cimport vector def type_cast(np_obj, np_type, name): + if not isinstance(np_obj, np.ndarray): + np_obj = np.asarray(np_obj) obj_type = np_obj.dtype if ((np.issubdtype(np_type, np.floating) and @@ -62,6 +66,83 @@ cdef class DataModel: self.variable_types = np.array([]) self.variable_names = np.array([]) self.row_names = np.array([]) + self.quadratic_constraints = [] + + def clear_quadratic_constraints(self): + self.quadratic_constraints = [] + + def _get_n_linear_constraints(self): + if self.b.shape[0] != 0: + return self.b.shape[0] + if self.A_offsets.shape[0] > 0: + return self.A_offsets.shape[0] - 1 + if self.host_row_types.shape[0] != 0: + return self.host_row_types.shape[0] + return 0 + + def add_quadratic_constraint( + self, + constraint_row_name="", + linear_values=None, + linear_indices=None, + rhs_value=0.0, + vals=None, + rows=None, + cols=None, + 
constraint_row_type="L", + ): + linear_values = ( + np.array([], dtype=np.float64) + if linear_values is None + else type_cast(linear_values, np.float64, "linear_values") + ) + linear_indices = ( + np.array([], dtype=np.int32) + if linear_indices is None + else type_cast(linear_indices, np.int32, "linear_indices") + ) + if linear_values.shape[0] != linear_indices.shape[0]: + raise ValueError("linear_values and linear_indices must have the same length") + vals = ( + np.array([], dtype=np.float64) + if vals is None + else type_cast(vals, np.float64, "vals") + ) + rows = ( + np.array([], dtype=np.int32) + if rows is None + else type_cast(rows, np.int32, "rows") + ) + cols = ( + np.array([], dtype=np.int32) + if cols is None + else type_cast(cols, np.int32, "cols") + ) + if not (vals.shape[0] == rows.shape[0] == cols.shape[0]): + raise ValueError("vals, rows, and cols must have the same length") + row_type = str(constraint_row_type) + if row_type == "E": + raise ValueError("Equality constraints are not supported.") + if row_type not in ("L", "G"): + raise ValueError( + f"Invalid constraint_row_type {row_type!r}; use 'L' or 'G' like set_row_types." 
+ ) + constraint_row_index = ( + self._get_n_linear_constraints() + len(self.quadratic_constraints) + ) + self.quadratic_constraints.append( + { + "constraint_row_index": int(constraint_row_index), + "constraint_row_name": str(constraint_row_name), + "constraint_row_type": row_type, + "linear_values": linear_values, + "linear_indices": linear_indices, + "rhs_value": float(rhs_value), + "vals": vals, + "rows": rows, + "cols": cols, + } + ) def set_maximize(self, maximize): self.maximize = maximize @@ -379,10 +460,65 @@ cdef class DataModel: self.get_initial_dual_solution().shape[0] ) + if self.quadratic_constraints: + self._set_cpp_quadratic_constraints(c_data_model_view) + + cdef void _set_cpp_quadratic_constraints( + self, data_model_view_t[int, double]* c_data_model_view + ): + cdef vector[mps_data_model_t[int, double].quadratic_constraint_t] constraints + cdef mps_data_model_t[int, double].quadratic_constraint_t qc + cdef dict item + cdef size_t i + cdef uintptr_t c_linear_values + cdef uintptr_t c_linear_indices + cdef uintptr_t c_vals + cdef uintptr_t c_rows + cdef uintptr_t c_cols + cdef size_t linear_nnz + cdef size_t quadratic_nnz + + for item in self.quadratic_constraints: + qc.constraint_row_index = item["constraint_row_index"] + qc.constraint_row_name = item["constraint_row_name"].encode("utf-8") + qc.constraint_row_type = ord(item.get("constraint_row_type", "L")) + qc.rhs_value = item["rhs_value"] + + linear_nnz = item["linear_values"].shape[0] + qc.linear_values.resize(linear_nnz) + qc.linear_indices.resize(linear_nnz) + if linear_nnz > 0: + c_linear_values = get_data_ptr(item["linear_values"]) + c_linear_indices = get_data_ptr(item["linear_indices"]) + for i in range(linear_nnz): + qc.linear_values[i] = (c_linear_values)[i] + qc.linear_indices[i] = (c_linear_indices)[i] + + quadratic_nnz = item["vals"].shape[0] + qc.vals.resize(quadratic_nnz) + qc.rows.resize(quadratic_nnz) + qc.cols.resize(quadratic_nnz) + if quadratic_nnz > 0: + c_vals = 
get_data_ptr(item["vals"]) + c_rows = get_data_ptr(item["rows"]) + c_cols = get_data_ptr(item["cols"]) + for i in range(quadratic_nnz): + qc.vals[i] = (c_vals)[i] + qc.rows[i] = (c_rows)[i] + qc.cols[i] = (c_cols)[i] + + constraints.push_back(qc) + + c_data_model_view.set_quadratic_constraints(constraints) + def writeMPS(self, user_problem_file): - self.variable_types = type_cast( - self.variable_types, "S1", "variable_types" - ) + n_vars = self.get_variable_lower_bounds().shape[0] + if self.variable_types.shape[0] == 0 and n_vars > 0: + self.variable_types = np.array(["C"] * n_vars, dtype="S1") + else: + self.variable_types = type_cast( + self.variable_types, "S1", "variable_types" + ) self.set_data_model_view() write_mps(self.c_data_model_view.get()[0], user_problem_file.encode('utf-8')) diff --git a/python/cuopt/cuopt/linear_programming/io/parser.pxd b/python/cuopt/cuopt/linear_programming/io/parser.pxd index d1cc95e9d6..947c6cb2a0 100644 --- a/python/cuopt/cuopt/linear_programming/io/parser.pxd +++ b/python/cuopt/cuopt/linear_programming/io/parser.pxd @@ -15,6 +15,17 @@ from libcpp.vector cimport vector cdef extern from "cuopt/linear_programming/io/mps_data_model.hpp" namespace "cuopt::linear_programming::io": # noqa cdef cppclass mps_data_model_t[i_t, f_t]: + cppclass quadratic_constraint_t: + int constraint_row_index + string constraint_row_name + char constraint_row_type + vector[double] linear_values + vector[int] linear_indices + double rhs_value + vector[int] rows + vector[int] cols + vector[double] vals + bool maximize_ vector[f_t] A_ vector[i_t] A_indices_ @@ -36,6 +47,7 @@ cdef extern from "cuopt/linear_programming/io/mps_data_model.hpp" namespace "cuo vector[char] row_types_ string objective_name_ string problem_name_ + const vector[quadratic_constraint_t]& get_quadratic_constraints() const cdef extern from "cuopt/linear_programming/io/utilities/cython_parser.hpp" namespace "cuopt::cython": # noqa diff --git 
a/python/cuopt/cuopt/linear_programming/io/parser_wrapper.pyx b/python/cuopt/cuopt/linear_programming/io/parser_wrapper.pyx index b2acff89fc..0f7e376001 100644 --- a/python/cuopt/cuopt/linear_programming/io/parser_wrapper.pyx +++ b/python/cuopt/cuopt/linear_programming/io/parser_wrapper.pyx @@ -133,6 +133,52 @@ cdef _marshal_data_model(mps_data_model_t[int, double]* dm, data_model): data_model.set_objective_name(dm.objective_name_.decode()) data_model.set_problem_name(dm.problem_name_.decode()) + cdef size_t qi + cdef size_t n_qc = dm.get_quadratic_constraints().size() + cdef mps_data_model_t[int, double].quadratic_constraint_t qc + cdef size_t linear_nnz, quadratic_nnz + cdef double[:] linear_values_view + cdef int[:] linear_indices_view + cdef double[:] quadratic_values_view + cdef int[:] quadratic_row_indices_view + cdef int[:] quadratic_col_indices_view + + for qi in range(n_qc): + qc = dm.get_quadratic_constraints()[qi] + linear_nnz = qc.linear_values.size() + if linear_nnz > 0: + linear_values_view = qc.linear_values.data() + linear_values = np.asarray(linear_values_view).copy() + linear_indices_view = qc.linear_indices.data() + linear_indices = np.asarray(linear_indices_view).copy() + else: + linear_values = None + linear_indices = None + + quadratic_nnz = qc.vals.size() + if quadratic_nnz > 0: + quadratic_values_view = qc.vals.data() + quadratic_values = np.asarray(quadratic_values_view).copy() + quadratic_row_indices_view = qc.rows.data() + quadratic_row_indices = np.asarray(quadratic_row_indices_view).copy() + quadratic_col_indices_view = qc.cols.data() + quadratic_col_indices = np.asarray(quadratic_col_indices_view).copy() + else: + quadratic_values = None + quadratic_row_indices = None + quadratic_col_indices = None + + data_model.add_quadratic_constraint( + qc.constraint_row_name.decode("utf-8"), + linear_values=linear_values, + linear_indices=linear_indices, + rhs_value=qc.rhs_value, + vals=quadratic_values, + rows=quadratic_row_indices, + 
cols=quadratic_col_indices, + sense=chr(qc.constraint_row_type), + ) + return data_model diff --git a/python/cuopt/cuopt/linear_programming/problem.py b/python/cuopt/cuopt/linear_programming/problem.py index 5de10410c2..10600a543b 100644 --- a/python/cuopt/cuopt/linear_programming/problem.py +++ b/python/cuopt/cuopt/linear_programming/problem.py @@ -275,8 +275,10 @@ def __mul__(self, other): qvars1 = [self] * len(other.vars) qvars2 = other.vars qcoeffs = other.coefficients - vars = [self] - coeffs = [other.constant] + vars, coeffs = [], [] + if other.constant != 0.0: + vars = [self] + coeffs = [other.constant] return QuadraticExpression( qvars1=qvars1, qvars2=qvars2, @@ -302,6 +304,8 @@ def __le__(self, other): # var1 <= var2 -> var1 - var2 <= 0 expr = self - other return Constraint(expr, LE, 0.0) + case QuadraticExpression(): + return Constraint(self - other, LE, 0.0) case _: raise ValueError("Unsupported operation") @@ -314,6 +318,8 @@ def __ge__(self, other): # var1 >= var2 -> var1 - var2 >= 0 expr = self - other return Constraint(expr, GE, 0.0) + case QuadraticExpression(): + return Constraint(self - other, GE, 0.0) case _: raise ValueError("Unsupported operation") @@ -333,8 +339,9 @@ def __eq__(self, other): class QuadraticExpression: """ QuadraticExpressions contain quadratic terms, linear terms, and a constant. - QuadraticExpressions can be used to create quadratic objectives in - the Problem. + Use them for quadratic objectives (``Problem.setObjective``) or quadratic + constraints via ``<=`` or ``>=`` comparisons passed to + :py:meth:`Problem.addConstraint` (equality is not supported). QuadraticExpressions can be added and subtracted with other QuadraticExpressions, LinearExpressions, and Variables, and can also be multiplied and divided by scalars. 
@@ -753,6 +760,9 @@ def __rsub__(self, other): # other - self -> other + self * -1.0 return other + self * -1.0 + def __neg__(self): + return self * -1.0 + def __imul__(self, other): # Compute expr *= constant match other: @@ -853,13 +863,91 @@ def __truediv__(self, other): ) def __le__(self, other): - raise Exception("Quadratic constraints not supported") + match other: + case int() | float(): + return Constraint(self, LE, float(other)) + case Variable() | LinearExpression() | QuadraticExpression(): + return Constraint(self - other, LE, 0.0) + case _: + raise ValueError( + "Can't compare QuadraticExpression with type %s" + % type(other).__name__ + ) def __ge__(self, other): - raise Exception("Quadratic constraints not supported") + match other: + case int() | float(): + return Constraint(self, GE, float(other)) + case Variable() | LinearExpression() | QuadraticExpression(): + return Constraint(self - other, GE, 0.0) + case _: + raise ValueError( + "Can't compare QuadraticExpression with type %s" + % type(other).__name__ + ) def __eq__(self, other): - raise Exception("Quadratic constraints not supported") + raise ValueError("Equality constraints are not supported.") + + +def _quadratic_expression_to_qcmatrix(expr, rhs): + """Build QCMATRIX COO data for a quadratic row ``expr`` sense ``rhs``. + + Used for both ``<=`` (``LE``) and ``>=`` (``GE``); row sense is stored on + ``Constraint.Sense``, not in this helper. The constant term is moved to + ``rhs_value``. + + Duplicate linear variable indices and duplicate Q (row, col) triplets are + merged by summing coefficients, matching linear ``Constraint`` behavior. 
+ """ + rhs_value = float(rhs) - expr.constant + + linear_coeff = {} + for var, coeff in zip(expr.vars, expr.coefficients): + if coeff == 0.0: + continue + idx = var.index + linear_coeff[idx] = linear_coeff.get(idx, 0.0) + coeff + + linear_indices = [] + linear_values = [] + for idx in sorted(linear_coeff): + coeff = linear_coeff[idx] + if coeff != 0.0: + linear_indices.append(idx) + linear_values.append(coeff) + + quad_coeff = {} + for var1, var2, coeff in zip(expr.qvars1, expr.qvars2, expr.qcoefficients): + if coeff == 0.0: + continue + key = (var1.index, var2.index) + quad_coeff[key] = quad_coeff.get(key, 0.0) + coeff + if expr.qmatrix is not None: + q_coo = expr.qmatrix.tocoo() + for row, col, value in zip(q_coo.row, q_coo.col, q_coo.data): + if value == 0.0: + continue + key = (expr.qvars[row].index, expr.qvars[col].index) + quad_coeff[key] = quad_coeff.get(key, 0.0) + value + + quadratic_row_indices = [] + quadratic_col_indices = [] + quadratic_values = [] + for (row, col), value in sorted(quad_coeff.items()): + if value != 0.0: + quadratic_row_indices.append(row) + quadratic_col_indices.append(col) + quadratic_values.append(value) + + return ( + linear_values, + linear_indices, + quadratic_values, + quadratic_row_indices, + quadratic_col_indices, + rhs_value, + ) class LinearExpression: @@ -1169,6 +1257,8 @@ def __le__(self, other): # expr1 <= expr2 -> expr1 - expr2 <= 0 expr = self - other return Constraint(expr, LE, 0.0) + case QuadraticExpression(): + return Constraint(self - other, LE, 0.0) def __ge__(self, other): match other: @@ -1178,6 +1268,8 @@ def __ge__(self, other): # expr1 >= expr2 -> expr1 - expr2 >= 0 expr = self - other return Constraint(expr, GE, 0.0) + case QuadraticExpression(): + return Constraint(self - other, GE, 0.0) def __eq__(self, other): match other: @@ -1191,16 +1283,15 @@ def __eq__(self, other): class Constraint: """ - cuOpt constraint object containing a linear expression, - the sense of the constraint, and the right-hand side 
of - the constraint. - Constraints are associated with a problem and can be - created using :py:meth:`Problem.addConstraint`. + cuOpt constraint object containing a linear or quadratic (QCMATRIX) + expression, the sense of the constraint, and the right-hand side. + Constraints are associated with a problem and can be created using + :py:meth:`Problem.addConstraint`. Parameters ---------- - expr : LinearExpression - Linear expression corresponding to a problem. + expr : LinearExpression or QuadraticExpression + Expression corresponding to the constraint. sense : enum Sense of the constraint. Either LE for <=, GE for >= or EQ for == . @@ -1216,7 +1307,9 @@ class Constraint: Sense : LE, GE or EQ Row sense. LE for <=, GE for >= or EQ for == . RHS : float - Constraint right-hand side value. + Constraint right-hand side value (linear rows). + is_quadratic : bool + True when the row is exported as a QCMATRIX quadratic constraint. Slack : float Computed LHS - RHS with current solution. DualValue : float @@ -1224,10 +1317,37 @@ class Constraint: """ def __init__(self, expr, sense, rhs, name=""): + self.index = -1 + self.Sense = sense + self.ConstraintName = name + self.DualValue = float("nan") + self.Slack = float("nan") + + if isinstance(expr, QuadraticExpression): + self.is_quadratic = True + ( + linear_values, + linear_indices, + quadratic_values, + quadratic_row_indices, + quadratic_col_indices, + rhs_value, + ) = _quadratic_expression_to_qcmatrix(expr, rhs) + self.linear_values = np.array(linear_values, dtype=np.float64) + self.linear_indices = np.array(linear_indices, dtype=np.int32) + self.vals = np.array(quadratic_values, dtype=np.float64) + self.rows = np.array(quadratic_row_indices, dtype=np.int32) + self.cols = np.array(quadratic_col_indices, dtype=np.int32) + self.rhs_value = rhs_value + self.RHS = rhs_value + self.vindex_coeff_dict = {} + self.vars = expr.vars + return + + self.is_quadratic = False self.vindex_coeff_dict = {} nz = len(expr) self.vars = 
expr.vars - self.index = -1 for i in range(nz): v_idx = expr.vars[i].index v_coeff = expr.coefficients[i] @@ -1236,11 +1356,7 @@ def __init__(self, expr, sense, rhs, name=""): if v_idx in self.vindex_coeff_dict else v_coeff ) - self.Sense = sense self.RHS = rhs - expr.getConstant() - self.ConstraintName = name - self.DualValue = float("nan") - self.Slack = float("nan") def __len__(self): return len(self.vindex_coeff_dict) @@ -1273,9 +1389,12 @@ def getCoefficient(self, var): def compute_slack(self): # Computes the constraint Slack in the current solution. - lhs = 0.0 - for var in self.vars: - lhs += var.Value * self.vindex_coeff_dict[var.index] + index_to_var = {var.index: var for var in self.vars} + lhs = sum( + index_to_var[v_idx].Value * coeff + for v_idx, coeff in self.vindex_coeff_dict.items() + ) + return self.RHS - lhs @@ -1420,6 +1539,8 @@ def _to_data_model(self): "values": [], } for constr in self.constrs: + if constr.is_quadratic: + continue csr_dict["column_indices"].extend( list(constr.vindex_coeff_dict.keys()) ) @@ -1439,6 +1560,8 @@ def _to_data_model(self): else: for constr in self.constrs: + if constr.is_quadratic: + continue self.rhs.append(constr.RHS) self.row_sense.append(constr.Sense) @@ -1484,6 +1607,23 @@ def _to_data_model(self): dm.set_row_names(self.row_names) dm.set_problem_name(self.Name) + for constr in self.constrs: + if not constr.is_quadratic: + continue + row_name = constr.ConstraintName + if row_name == "": + row_name = "Q" + str(constr.index) + dm.add_quadratic_constraint( + constraint_row_name=row_name, + linear_values=constr.linear_values, + linear_indices=constr.linear_indices, + rhs_value=constr.rhs_value, + vals=constr.vals, + rows=constr.rows, + cols=constr.cols, + sense=constr.Sense, + ) + if self.mip_start.size > 0 and not np.all(np.isnan(self.mip_start)): dm.set_initial_primal_solution(self.mip_start) @@ -1554,15 +1694,15 @@ def addVariable( def addConstraint(self, constr, name=""): """ Adds a constraint to the problem 
defined by constraint object - and name. A constraint is generated using LinearExpression, - Sense and RHS. + and name. A constraint is generated using LinearExpression or + QuadraticExpression comparisons (``<=``, ``>=``, or ``==``). Parameters ---------- constr : :py:class:`Constraint` - Constructed using LinearExpressions (See Examples) + Constructed using expression comparisons (see Examples). name : string - Name of the variable. Optional. + Name of the constraint. Optional. Examples -------- @@ -1572,6 +1712,7 @@ def addConstraint(self, constr, name=""): >>> problem.addConstraint(2*x - 3*y <= 10, name="Constr1") >>> expr = 3*x + y >>> problem.addConstraint(expr + x == 20, name="Constr2") + >>> problem.addConstraint(-x*x + y*y <= 0, name="soc") """ if self.solved: self.reset_solved_values() # Reset all solved values @@ -1611,6 +1752,10 @@ def updateConstraint(self, constr, coeffs=[], rhs=None): """ self.reset_solved_values() if isinstance(constr, Constraint): + if constr.is_quadratic: + raise ValueError( + "updateConstraint applies to linear constraints only" + ) if isinstance(coeffs, dict): coeffs = coeffs.items() for var, coeff in coeffs: @@ -1911,6 +2056,12 @@ def NumConstraints(self): # Returns number of contraints in the problem. return len(self.constrs) + def getQuadraticConstraints(self): + """ + Returns all quadratic (QCMATRIX) constraints in the problem. + """ + return [c for c in self.constrs if c.is_quadratic] + @property def NumNZs(self): # Returns number of non-zeros in the problem. 
@@ -1957,6 +2108,8 @@ def getCSR(self): return self.dict_to_object(self.constraint_csr_matrix) csr_dict = {"row_pointers": [0], "column_indices": [], "values": []} for constr in self.constrs: + if constr.is_quadratic: + continue csr_dict["column_indices"].extend( list(constr.vindex_coeff_dict.keys()) ) @@ -2033,10 +2186,14 @@ def populate_solution(self, solution): dual_sol = None if not IsMIP: dual_sol = solution.get_dual_solution() - for i, constr in enumerate(self.constrs): - if dual_sol is not None and len(dual_sol) > 0: - constr.DualValue = dual_sol[i] + linear_row = 0 + for constr in self.constrs: + if constr.is_quadratic: + continue + if dual_sol is not None and len(dual_sol) > linear_row: + constr.DualValue = dual_sol[linear_row] constr.Slack = constr.compute_slack() + linear_row += 1 self.solved = True def solve(self, settings=solver_settings.SolverSettings()): @@ -2061,3 +2218,4 @@ def solve(self, settings=solver_settings.SolverSettings()): solution = solver.Solve(self.model, settings) # Post Solve self.populate_solution(solution) + return solution diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py index 5e7ab6e94c..860b7aef2a 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -150,6 +150,16 @@ def test_model(): assert prob.ObjValue == pytest.approx(5 * x.Value + 3 * y.Value + 70) +def test_constraint_duplicate_terms_slack(): + """Merged coeffs in vindex_coeff_dict must not be double-counted in slack.""" + prob = Problem() + x = prob.addVariable() + c = prob.addConstraint(5 * x + 7 * x <= 18) + assert c.getCoefficient(x) == 12 + x.Value = 1.0 + assert c.compute_slack() == pytest.approx(6.0) + + def test_semi_continuous_variable(): prob = Problem("Semi-continuous") x = prob.addVariable(lb=5.0, ub=10.0, vtype=SEMI_CONTINUOUS, name="x") diff --git 
a/python/cuopt/cuopt/tests/socp/test_socp.py b/python/cuopt/cuopt/tests/socp/test_socp.py new file mode 100644 index 0000000000..a406f1a2e6 --- /dev/null +++ b/python/cuopt/cuopt/tests/socp/test_socp.py @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Barrier SOCP tests via the Problem Python API. + +Checks that the barrier solution is mapped back to the original model variables +after SOC conversion (see ``project_barrier_solution_to_model_variables`` in +``cpp/src/barrier/translate_soc.hpp``). +""" + +from __future__ import annotations + +import numpy as np +import pytest + +from cuopt.linear_programming.problem import EQ, GE, LE, Problem +from cuopt.linear_programming.solver.solver_parameters import CUOPT_METHOD +from cuopt.linear_programming.solver_settings import ( + SolverMethod, + SolverSettings, +) + +EXPECTED_SOCP_1_OBJECTIVE = -13.548638904065102 +EXPECTED_SOCP_1_X = (-3.874621860638774, -2.129788233677883, 2.33480343377204) +EXPECTED_SOCP_1_Y = 5.0 + +EXPECTED_SOCP_3_OBJECTIVE = -1.932105 +EXPECTED_SOCP_3_X = (0.83666003, -0.54772256) + +OBJ_TOL = 1e-6 +PRIMAL_TOL = 1e-6 +FEAS_TOL = 1e-6 + + +def _barrier_settings() -> SolverSettings: + settings = SolverSettings() + settings.set_parameter(CUOPT_METHOD, SolverMethod.Barrier) + return settings + + +def _soc_two_dim_constraint(problem, x0, x1, mat, head) -> None: + """Encode ||mat @ [x0, x1]||_2 <= head as a standard Lorentz cone in (head, z0, z1).""" + z0 = problem.addVariable(lb=-np.inf) + z1 = problem.addVariable(lb=-np.inf) + problem.addConstraint(z0 == mat[0, 0] * x0 + mat[0, 1] * x1) + problem.addConstraint(z1 == mat[1, 0] * x0 + mat[1, 1] * x1) + problem.addConstraint(z0 * z0 + z1 * z1 - head * head <= 0) + + +def build_socp_1() -> tuple[Problem, tuple]: + """Min 3*x0+2*x1+x2 s.t. 
||x||_2 <= y, x0+x1+3*x2 >= 1, 0 <= y <= 5.""" + problem = Problem("socp_1") + x0 = problem.addVariable(lb=-np.inf, name="x0") + x1 = problem.addVariable(lb=-np.inf, name="x1") + x2 = problem.addVariable(lb=-np.inf, name="x2") + y = problem.addVariable(lb=0, name="y") + problem.setObjective(3 * x0 + 2 * x1 + x2) + problem.addConstraint(y >= 0) + problem.addConstraint(x0 * x0 + x1 * x1 + x2 * x2 - y * y <= 0) + problem.addConstraint(x0 + x1 + 3 * x2 >= 1) + problem.addConstraint(y <= 5) + return problem, (x0, x1, x2, y) + + +def build_socp_3() -> tuple[Problem, tuple]: + """Min -x0+2*x1 s.t. ||M_i x||_2 <= 1 for three fixed 2x2 maps M_i.""" + root2 = np.sqrt(2.0) + u = np.array([[1 / root2, -1 / root2], [1 / root2, 1 / root2]]) + mat1 = np.diag([root2, 1 / root2]) @ u.T + mat2 = np.diag([1.0, 1.0]) + mat3 = np.diag([0.2, 1.8]) + + problem = Problem("socp_3") + x0 = problem.addVariable(lb=-np.inf, name="x0") + x1 = problem.addVariable(lb=-np.inf, name="x1") + problem.setObjective(-x0 + 2 * x1) + h1 = problem.addVariable(lb=1, ub=1, name="h1") + h2 = problem.addVariable(lb=1, ub=1, name="h2") + h3 = problem.addVariable(lb=1, ub=1, name="h3") + _soc_two_dim_constraint(problem, x0, x1, mat1, h1) + _soc_two_dim_constraint(problem, x0, x1, mat2, h2) + _soc_two_dim_constraint(problem, x0, x1, mat3, h3) + return problem, (x0, x1, h1, h2, h3) + + +def _quadratic_constraint_violation(constr, variables) -> float: + """QCMATRIX row value minus rhs (should be <= 0 for L rows).""" + vals = [var.Value for var in variables] + quad = 0.0 + for k in range(len(constr.vals)): + i = int(constr.rows[k]) + j = int(constr.cols[k]) + quad += float(constr.vals[k]) * vals[i] * vals[j] + lin = 0.0 + for k in range(len(constr.linear_values)): + lin += ( + float(constr.linear_values[k]) + * vals[int(constr.linear_indices[k])] + ) + return quad + lin - float(constr.rhs_value) + + +def _assert_solution_on_original_model(problem: Problem, solution) -> None: + primal = solution.get_primal_solution() 
+ assert len(primal) == problem.NumVariables + assert problem.ObjValue == pytest.approx( + solution.get_primal_objective(), rel=0, abs=OBJ_TOL + ) + assert problem.ObjValue == pytest.approx( + problem.getObjective().getValue(), rel=0, abs=OBJ_TOL + ) + + +def _assert_feasible(problem: Problem) -> None: + variables = problem.getVariables() + for constr in problem.getConstraints(): + if constr.is_quadratic: + assert ( + _quadratic_constraint_violation(constr, variables) <= FEAS_TOL + ) + continue + slack = constr.compute_slack() + if constr.Sense == LE: + assert slack >= -FEAS_TOL + elif constr.Sense == GE: + assert slack <= FEAS_TOL + else: + assert constr.Sense == EQ + assert slack == pytest.approx(0.0, abs=FEAS_TOL) + + +def _solve(problem: Problem): + solution = problem.solve(_barrier_settings()) + assert problem.Status.name == "Optimal" + return solution + + +def test_socp_1_barrier_solution(): + problem, (x0, x1, x2, y) = build_socp_1() + solution = _solve(problem) + _assert_solution_on_original_model(problem, solution) + _assert_feasible(problem) + + assert problem.ObjValue == pytest.approx( + EXPECTED_SOCP_1_OBJECTIVE, abs=OBJ_TOL + ) + assert x0.Value == pytest.approx(EXPECTED_SOCP_1_X[0], abs=PRIMAL_TOL) + assert x1.Value == pytest.approx(EXPECTED_SOCP_1_X[1], abs=PRIMAL_TOL) + assert x2.Value == pytest.approx(EXPECTED_SOCP_1_X[2], abs=PRIMAL_TOL) + assert y.Value == pytest.approx(EXPECTED_SOCP_1_Y, abs=PRIMAL_TOL) + + +def test_socp_3_barrier_solution(): + problem, (x0, x1, h1, h2, h3) = build_socp_3() + solution = _solve(problem) + _assert_solution_on_original_model(problem, solution) + _assert_feasible(problem) + + assert problem.ObjValue == pytest.approx( + EXPECTED_SOCP_3_OBJECTIVE, abs=OBJ_TOL + ) + assert x0.Value == pytest.approx(EXPECTED_SOCP_3_X[0], abs=PRIMAL_TOL) + assert x1.Value == pytest.approx(EXPECTED_SOCP_3_X[1], abs=PRIMAL_TOL) + assert h1.Value == pytest.approx(1.0, abs=PRIMAL_TOL) + assert h2.Value == pytest.approx(1.0, abs=PRIMAL_TOL) 
+ assert h3.Value == pytest.approx(1.0, abs=PRIMAL_TOL)