
Commit 36d173e

Refactor: add new API alongside legacy interfaces with deprecation warnings

1 parent ea019e0 commit 36d173e

23 files changed: 663 additions & 57 deletions

=0.34.0,

Whitespace-only changes.

csrc/engine/infer_engine.cpp

Lines changed: 41 additions & 4 deletions

@@ -1,27 +1,65 @@
 #include "infer_engine.hpp"
 #include "spdlog/spdlog.h"
-#include <iostream>
 
 namespace infinilm::engine {
 
 //------------------------------------------------------
 // Constructor
 //------------------------------------------------------
+/**
+ * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
+ *
+ * ⚠️ DEVELOPMENT POLICY:
+ * - NO new development or feature additions permitted on this interface
+ * - Only critical bug fixes (security/stability) allowed until removal
+ * - All new code MUST migrate to the polymorphic overload below
+ *
+ * Replacement: Use the polymorphic overload of this same function name with updated signature
+ * Reason: Legacy signature lacks support for dynamic quantization modes.
+ * Removal target: v0.2.0 (Q2 2026)
+ */
 InferEngine::InferEngine(
+    const InfinilmModel::Config &config,
     const distributed::DistConfig &distributed_config,
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
+    bool enable_graph_compiling) // Changed parameter
+    : communication_group_(distributed_config, device_type),
+      legacy_model_config_(config) {
+
+    if (cache_config != nullptr) {
+        cache_config_ = cache_config->unique_copy();
+    }
+    // Create one RankWorker per rank
+    int world_size = communication_group_.get_world_size();
+    barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
+    workers_.reserve(world_size);
+    for (int r = 0; r < world_size; ++r) {
+        workers_.emplace_back(std::make_unique<RankWorker>(
+            legacy_model_config_,
+            communication_group_.get_rank_info(r),
+            cache_config_ != nullptr ? cache_config_.get() : nullptr,
+            barrier_.get(),
+            enable_graph_compiling));
+    }
+
+    // Compile the model on all workers
+    this->compile();
+}
+
+InferEngine::InferEngine(
     const std::string &model_path,
+    const distributed::DistConfig &distributed_config,
+    infinicore::Device::Type device_type,
+    const cache::CacheConfig *cache_config,
     bool enable_graph_compiling) // Changed parameter
     : communication_group_(distributed_config, device_type) {
-
     if (cache_config != nullptr) {
         cache_config_ = cache_config->unique_copy();
    }
 
     // Load model config if model_path is provided, model_path must be valid, and config.json exists
     this->model_config_ = std::make_shared<infinilm::config::ModelConfig>(model_path + "/config.json");
-
     // Create one RankWorker per rank
     int world_size = communication_group_.get_world_size();
     barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
@@ -34,7 +72,6 @@ InferEngine::InferEngine(
             barrier_.get(),
             enable_graph_compiling));
     }
-
     // Compile the model on all workers
     this->compile();
 }

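Taken together, the two constructors above define the migration path: callers that used to hand the engine a prebuilt InfinilmModel::Config should switch to passing a model directory and let the engine load <model_path>/config.json itself. A minimal caller-side sketch of both call sites, assuming the headers in this commit and relying on the defaulted arguments declared in infer_engine.hpp (the config value, include path, namespace qualification, and model path are hypothetical):

#include "infer_engine.hpp" // include path assumed

int main() {
    // Deprecated: caller-constructed model config (scheduled for removal in v0.2.0).
    infinilm::InfinilmModel::Config legacy_config; // hypothetical: qualification and field values depend on the model
    infinilm::engine::InferEngine legacy_engine(legacy_config);

    // Replacement: the engine reads <model_path>/config.json itself.
    infinilm::engine::InferEngine engine("/models/example-llm"); // hypothetical path
    return 0;
}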
csrc/engine/infer_engine.hpp

Lines changed: 20 additions & 0 deletions

@@ -20,11 +20,30 @@ class InferEngine {
     using Output = RankWorker::Output;
 
     // Updated constructor: accept CacheConfig instead of CacheType
+    /**
+     * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
+     *
+     * ⚠️ DEVELOPMENT POLICY:
+     * - NO new development or feature additions permitted on this interface
+     * - Only critical bug fixes (security/stability) allowed until removal
+     * - All new code MUST migrate to the polymorphic overload below
+     *
+     * Replacement: Use the polymorphic overload of this same function name with updated signature
+     * Reason: Legacy signature lacks support for dynamic quantization modes.
+     * Removal target: v0.2.0 (Q2 2026)
+     */
     InferEngine(
+        const InfinilmModel::Config &config,
         const distributed::DistConfig &distributed_config = distributed::DistConfig(),
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
+        bool enable_graph_compiling = false);
+
+    InferEngine(
         const std::string &model_path = "",
+        const distributed::DistConfig &distributed_config = distributed::DistConfig(),
+        infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
+        const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false);
 
     // Load a parameter to all workers (each can extract its shard inside RankWorker)
@@ -52,6 +71,7 @@ class InferEngine {
     std::unique_ptr<RankBarrier> barrier_;
     distributed::CommunicationGroup communication_group_;
     std::unique_ptr<cache::CacheConfig> cache_config_;
+    const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
 };
 
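Since every parameter after the first is defaulted in both declarations, overload resolution hinges on the first argument alone: an InfinilmModel::Config selects the deprecated constructor, while a std::string (including a string literal, via implicit conversion) selects its replacement. A brief illustration under the same hypothetical assumptions as the sketch above:

infinilm::InfinilmModel::Config cfg;          // hypothetical config value
infinilm::engine::InferEngine a(cfg);         // first argument is a Config -> deprecated overload
infinilm::engine::InferEngine b("/models/x"); // const char* converts to std::string -> new overload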
csrc/engine/rank_worker.cpp

Lines changed: 37 additions & 1 deletion

@@ -4,12 +4,48 @@
 
 #include "infinicore/ops.hpp"
 
-#include <iostream>
 #include <spdlog/spdlog.h>
 #include <stdexcept>
 
 namespace infinilm::engine {
 
+/**
+ * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
+ *
+ * ⚠️ DEVELOPMENT POLICY:
+ * - NO new development or feature additions permitted on this interface
+ * - Only critical bug fixes (security/stability) allowed until removal
+ * - All new code MUST migrate to the polymorphic overload below
+ *
+ * Replacement: Use the polymorphic overload of this same function name with updated signature
+ * Reason: Legacy signature lacks support for dynamic quantization modes.
+ * Removal target: v0.2.0 (Q2 2026)
+ */
+RankWorker::RankWorker(const InfinilmModel::Config &model_config,
+                       const distributed::RankInfo &rank_info,
+                       const cache::CacheConfig *cache_config,
+                       RankBarrier *barrier,
+                       bool enable_graph_compiling)
+    : legacy_model_config_(model_config),
+      rank_info_(rank_info),
+      enable_graph_compiling_(enable_graph_compiling),
+      job_cmd_(Command::INIT),
+      has_job_(false),
+      job_done_(false),
+      should_exit_(false),
+      init_done_(false),
+      barrier_(barrier) {
+    if (cache_config != nullptr) {
+        pending_cache_config_ = cache_config->unique_copy();
+    }
+    // start the thread
+    thread_ = std::thread(&RankWorker::thread_loop, this);
+
+    // Wait until the worker thread finishes initialization (model created)
+    std::unique_lock<std::mutex> lk(mutex_);
+    cv_.wait(lk, [&] { return init_done_; });
+}
+
 RankWorker::RankWorker(
     std::shared_ptr<infinilm::config::ModelConfig> model_config,
     const distributed::RankInfo &rank_info,

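The added constructor spawns thread_ and then blocks on cv_ until the worker thread flips init_done_, a standard condition-variable startup handshake. The thread_loop side falls outside this hunk; the following is a self-contained sketch of the same pattern, with illustrative names rather than the repository's:

#include <condition_variable>
#include <mutex>
#include <thread>

// Stand-alone version of the constructor/worker-thread handshake shown above.
class Worker {
public:
    Worker() {
        thread_ = std::thread(&Worker::thread_loop, this);
        // Block until the worker thread reports that initialization finished.
        std::unique_lock<std::mutex> lk(mutex_);
        cv_.wait(lk, [&] { return init_done_; });
    }
    ~Worker() { thread_.join(); }

private:
    void thread_loop() {
        // ... expensive per-rank initialization would happen here (e.g. model construction) ...
        {
            std::lock_guard<std::mutex> lk(mutex_);
            init_done_ = true;
        }
        cv_.notify_all(); // wake the constructor
    }

    std::thread thread_;
    std::mutex mutex_;
    std::condition_variable cv_;
    bool init_done_ = false;
};

int main() { Worker w; }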
csrc/engine/rank_worker.hpp

Lines changed: 8 additions & 2 deletions

@@ -57,6 +57,12 @@ class RankWorker {
         infinicore::Tensor output_ids;
     };
 
+    RankWorker(const InfinilmModel::Config &model_config,
+               const distributed::RankInfo &rank_info,
+               const cache::CacheConfig *cache_config,
+               RankBarrier *barrier,
+               bool enable_graph_compiling);
+
     RankWorker(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                const distributed::RankInfo &rank_info,
                const cache::CacheConfig *cache_config,
@@ -95,11 +101,11 @@ class RankWorker {
 
 private:
     // Worker properties
-    // const InfinilmModel::Config &model_config_;
+    const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
+    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
     distributed::RankInfo rank_info_;
     std::shared_ptr<InfinilmModel> model_;
     std::shared_ptr<cache::Cache> cache_;
-    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
 
     // Graph Compiling
     bool enable_graph_compiling_;

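One thing to keep in mind when reviewing the reshuffled member list: C++ initializes non-static data members in declaration order, not in the order they appear in a constructor's initializer list, so moving model_config_ up changes when it is initialized relative to rank_info_ and the other members. A compact stand-alone illustration (hypothetical class, not from the repository):

#include <cstdio>

struct Demo {
    // b_ is listed first in the initializer list, but a_ is still
    // initialized first because it is declared first (compilers
    // typically flag this mismatch with -Wreorder).
    Demo() : b_((std::puts("init b_"), 2)), a_((std::puts("init a_"), 1)) {}
    int a_;
    int b_;
};

int main() { Demo d; } // prints "init a_" then "init b_"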
csrc/layers/fused_linear.cpp

Lines changed: 103 additions & 14 deletions

@@ -6,39 +6,102 @@ namespace infinilm::layers {
 // ---------------------------------------------------------
 // QKV Parallel Linear
 // ---------------------------------------------------------
+/**
+ * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
+ *
+ * ⚠️ DEVELOPMENT POLICY:
+ * - NO new development or feature additions permitted on this interface
+ * - Only critical bug fixes (security/stability) allowed until removal
+ * - All new code MUST migrate to the polymorphic overload below
+ *
+ * Replacement: Use the polymorphic overload of this same function name with updated signature
+ * Reason: Legacy signature lacks support for dynamic quantization modes.
+ * Removal target: v0.2.0 (Q2 2026)
+ */
 QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
                                      size_t head_dim,
                                      size_t num_q_head,
                                      size_t num_kv_head,
                                      bool bias,
                                      const infinicore::DataType &dtype,
                                      const infinicore::Device &device,
-                                     engine::distributed::RankInfo rank_info,
-                                     std::optional<infinicore::nn::QuantScheme> quant_scheme)
+                                     engine::distributed::RankInfo rank_info)
     : QKVParallelLinear(hidden_size,
                         head_dim, head_dim, head_dim,
                         num_q_head, num_kv_head, num_kv_head,
                         bias, bias, bias,
-                        dtype, device, rank_info,
-                        quant_scheme) {}
+                        dtype, device, rank_info) {}
 
 QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
                                      size_t q_dim, size_t k_dim, size_t v_dim,
                                      size_t num_q_head, size_t num_k_head, size_t num_v_head,
                                      bool q_bias, bool k_bias, bool v_bias,
                                      const infinicore::DataType &dtype,
                                      const infinicore::Device &device,
-                                     engine::distributed::RankInfo rank_info,
-                                     std::optional<infinicore::nn::QuantScheme> quant_scheme)
+                                     engine::distributed::RankInfo rank_info)
     : infinicore::nn::ColumnParallelLinear(
           hidden_size,
           num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim,
           (q_bias || k_bias || v_bias),
           dtype,
           device,
           rank_info.tp_rank,
-          rank_info.tp_size,
-          quant_scheme),
+          rank_info.tp_size),
+      q_dim_(q_dim),
+      k_dim_(k_dim),
+      v_dim_(v_dim),
+      num_q_head_(num_q_head),
+      num_k_head_(num_k_head),
+      num_v_head_(num_v_head),
+      q_bias_(q_bias),
+      k_bias_(k_bias),
+      v_bias_(v_bias) {
+    if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) {
+        throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size");
+    }
+
+    if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) {
+        throw std::runtime_error("q_bias, k_bias, v_bias must all match");
+    }
+
+    q_out_size_ = num_q_head_ * q_dim_ / tp_size_;
+    k_out_size_ = num_k_head_ * k_dim_ / tp_size_;
+    v_out_size_ = num_v_head_ * v_dim_ / tp_size_;
+}
+
+QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
+                                     size_t head_dim,
+                                     size_t num_q_head,
+                                     size_t num_kv_head,
+                                     infinicore::nn::QuantScheme quant_scheme,
+                                     bool bias,
+                                     const infinicore::DataType &dtype,
+                                     const infinicore::Device &device,
+                                     engine::distributed::RankInfo rank_info)
+    : QKVParallelLinear(hidden_size,
+                        head_dim, head_dim, head_dim,
+                        num_q_head, num_kv_head, num_kv_head,
+                        bias, bias, bias,
+                        quant_scheme,
+                        dtype, device, rank_info) {}
+
+QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
+                                     size_t q_dim, size_t k_dim, size_t v_dim,
+                                     size_t num_q_head, size_t num_k_head, size_t num_v_head,
+                                     bool q_bias, bool k_bias, bool v_bias,
+                                     infinicore::nn::QuantScheme quant_scheme,
+                                     const infinicore::DataType &dtype,
+                                     const infinicore::Device &device,
+                                     engine::distributed::RankInfo rank_info)
+    : infinicore::nn::ColumnParallelLinear(
+          hidden_size,
+          num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim,
+          quant_scheme,
+          (q_bias || k_bias || v_bias),
+          dtype,
+          device,
+          rank_info.tp_rank,
+          rank_info.tp_size),
       q_dim_(q_dim),
       k_dim_(k_dim),
       v_dim_(v_dim),
@@ -141,18 +204,44 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; }
 // ---------------------------------------------------------
 // Gate-Up Parallel Linear
 // ---------------------------------------------------------
+/**
+ * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
+ *
+ * ⚠️ DEVELOPMENT POLICY:
+ * - NO new development or feature additions permitted on this interface
+ * - Only critical bug fixes (security/stability) allowed until removal
+ * - All new code MUST migrate to the polymorphic overload below
+ *
+ * Replacement: Use the polymorphic overload of this same function name with updated signature
+ * Reason: Legacy signature lacks support for dynamic quantization modes.
+ * Removal target: v0.2.0 (Q2 2026)
+ */
 GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias,
                                            const infinicore::DataType &dtype, const infinicore::Device &device,
-                                           engine::distributed::RankInfo rank_info,
-                                           std::optional<infinicore::nn::QuantScheme> quant_scheme)
-    : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info, quant_scheme) {
+                                           engine::distributed::RankInfo rank_info)
+    : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) {
+}
+
+GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
+                                           const infinicore::DataType &dtype, const infinicore::Device &device,
+                                           engine::distributed::RankInfo rank_info)
+    : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
+    if (gate_bias_ != up_bias_) {
+        throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
+    }
+}
+
+GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, infinicore::nn::QuantScheme quant_scheme, bool bias,
+                                           const infinicore::DataType &dtype, const infinicore::Device &device,
+                                           engine::distributed::RankInfo rank_info)
+    : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quant_scheme, dtype, device, rank_info) {
 }
 
 GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
+                                           infinicore::nn::QuantScheme quant_scheme,
                                            const infinicore::DataType &dtype, const infinicore::Device &device,
-                                           engine::distributed::RankInfo rank_info,
-                                           std::optional<infinicore::nn::QuantScheme> quant_scheme)
-    : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size, quant_scheme), gate_bias_(gate_bias), up_bias_(up_bias) {
+                                           engine::distributed::RankInfo rank_info)
+    : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quant_scheme, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
     if (gate_bias_ != up_bias_) {
         throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
    }