
Commit a4ced80

Merge pull request #205 from InfiniTensor/demo131
Demo-131 Cuda graph with optimized paged attention

2 parents 96ecf49 + 04c37f3

54 files changed

Lines changed: 2287 additions & 260 deletions
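Judging from the files shown below, this merge adds a nlohmann/json-backed ModelConfig/QuantConfig layer for parsing config.json, a device-dispatched fused KV-cache update (infinicore::op::kv_caching_) on NVIDIA, Iluvatar, and Metax, a GeneralCompiler that combines the static-batching and paged graph compilers, and new --qy and --ali device flags in the llama example.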

Some content is hidden: only a subset of the 54 changed files appears below.

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -29,3 +29,5 @@ __pycache__/
 *.txt

 *.http
+
+*.nsys-rep
```

.gitmodules

Lines changed: 3 additions & 0 deletions
```diff
@@ -1,3 +1,6 @@
 [submodule "third_party/spdlog"]
 	path = third_party/spdlog
 	url = https://github.com/gabime/spdlog.git
+[submodule "third_party/json"]
+	path = third_party/json
+	url = https://github.com/nlohmann/json.git
```

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Single inference test (单次推理测试)
 - llama example (llama示例)
 ```bash
-python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
+python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
 ```
 - For example (例如):
 ```bash
````

csrc/cache/kv_cache.cpp

Lines changed: 23 additions & 13 deletions
```diff
@@ -85,26 +85,36 @@ StaticKVCache::update(size_t layer_idx,
 
     auto batch_size = k->size(0);
     auto update_len = k->size(2);
-    size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
-    auto result_len = cache_pos + update_len;
-
-    ASSERT(result_len <= cache_len_);
 
     ASSERT_EQ(batch_size, rank_batch_size_);
 
     auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
     auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
 
-    auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
-    auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});
-
-    k_cache_update->copy_from(k);
-    v_cache_update->copy_from(v);
-
-    auto k_total = k_cache_layer->narrow({{2, 0, result_len}});
-    auto v_total = v_cache_layer->narrow({{2, 0, result_len}});
+    auto device = k_cache_layer->device();
+
+    if (device.getType() == infinicore::Device::Type::NVIDIA
+        || device.getType() == infinicore::Device::Type::ILUVATAR
+        || device.getType() == infinicore::Device::Type::METAX) {
+        infinicore::op::kv_caching_(
+            k_cache_layer,
+            v_cache_layer,
+            k,
+            v,
+            past_sequence_lengths);
+    } else {
+        size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
+        auto result_len = cache_pos + update_len;
+        ASSERT(result_len <= cache_len_);
+
+        auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
+        auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});
+
+        k_cache_update->copy_from(k);
+        v_cache_update->copy_from(v);
+    }
 
-    return {k_total, v_total};
+    return {k_cache_layer, v_cache_layer};
 }
 
 // ==========================
```
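The previous code always copied past_sequence_lengths to the CPU to read the cache position, a device-to-host sync that cannot be captured in a CUDA graph; on NVIDIA, Iluvatar, and Metax the update is now delegated to the fused infinicore::op::kv_caching_, which (presumably, given the commit title) reads the position on-device and stays graph-capturable. As a minimal CPU reference of the write semantics, assuming a row-major [seq, dim] layout (hypothetical helper, not part of infinicore):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Reference for one (layer, head) cache slab: copy `update_len` timesteps of
// `update` into `cache` starting at `cache_pos`. The fused op performs this
// write for both K and V without reading cache_pos on the host.
void kv_cache_update_ref(std::vector<float> &cache, std::size_t cache_len,
                         const std::vector<float> &update, std::size_t update_len,
                         std::size_t dim, std::size_t cache_pos) {
    assert(cache_pos + update_len <= cache_len);
    for (std::size_t t = 0; t < update_len; ++t) {
        for (std::size_t d = 0; d < dim; ++d) {
            cache[(cache_pos + t) * dim + d] = update[t * dim + d];
        }
    }
}
```

Note also that update() now returns the full {k_cache_layer, v_cache_layer} slabs instead of views truncated to result_len, consistent with the valid length no longer being read on the host.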

csrc/config/model_config.cpp

Lines changed: 88 additions & 0 deletions
```cpp
#include "model_config.hpp"

namespace infinilm::config {
ModelConfig::ModelConfig(const std::string &path) {
    std::ifstream file(path);
    if (file.is_open()) {
        file >> config_json;
        file.close();
    } else {
        throw std::runtime_error("Could not open config file: " + path);
    }
    this->quant_config = QuantConfig(config_json["quantization_config"]);
}

infinicore::quantization::QuantScheme
ModelConfig::get_quant_scheme() const {
    return quant_config.get_quant_scheme();
}

std::shared_ptr<infinicore::nn::RoPE::ScalingConfig>
ModelConfig::get_rope_scaling() const {
    if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) {
        return nullptr;
    }

    const auto &rope_scaling = config_json["rope_scaling"];
    if (!rope_scaling.is_object()) {
        throw std::runtime_error("rope_scaling must be an object");
    }

    if (!rope_scaling.contains("type")) {
        throw std::runtime_error("rope_scaling must contain 'type' field");
    }

    std::string type_str = rope_scaling["type"].get<std::string>();
    if (type_str == "longrope") {
        // Required fields for LongRopeConfig
        if (!rope_scaling.contains("short_factor") || !rope_scaling.contains("long_factor") || !rope_scaling.contains("original_max_position_embeddings")) {
            throw std::runtime_error(
                "LongRopeConfig requires 'short_factor', 'long_factor', and 'original_max_position_embeddings'");
        }

        auto short_factor = rope_scaling["short_factor"].get<std::vector<float>>();
        auto long_factor = rope_scaling["long_factor"].get<std::vector<float>>();
        size_t original_max_position_embeddings = rope_scaling["original_max_position_embeddings"].get<size_t>();

        float factor = 1.0f;
        if (rope_scaling.contains("factor")) {
            factor = rope_scaling["factor"].get<float>();
        }

        return std::make_shared<infinicore::nn::RoPE::LongRopeConfig>(
            std::move(short_factor),
            std::move(long_factor),
            original_max_position_embeddings,
            factor);
    } else if (type_str == "default" || type_str == "none") {
        // Default scaling, no scaling applied
        return nullptr;
    } else {
        throw std::runtime_error("Unsupported rope_scaling type: " + type_str);
    }
}

infinicore::DataType
ModelConfig::get_dtype() const {
    try {
        std::string dtype_str = this->get<std::string>("torch_dtype");
        if (dtype_str == "float32") {
            return infinicore::DataType::F32;
        } else if (dtype_str == "float16") {
            return infinicore::DataType::F16;
        } else if (dtype_str == "bfloat16") {
            return infinicore::DataType::BF16;
        } else if (dtype_str == "int8") {
            return infinicore::DataType::I8;
        } else {
            throw std::runtime_error("Unsupported dtype string: " + dtype_str);
        }
    } catch (const std::exception &e) {
        throw std::runtime_error("Error getting dtype from config: " + std::string(e.what()));
    }
}
} // namespace infinilm::config
```
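For reference, a sketch of the "rope_scaling" object shape that get_rope_scaling() accepts on the longrope path; the field values here are illustrative, not taken from this commit:

```cpp
#include <nlohmann/json.hpp>

// Illustrative input for the longrope branch: short_factor/long_factor are
// per-dimension scale lists, and factor defaults to 1.0 when absent.
const auto rope_scaling_example = nlohmann::json::parse(R"({
    "type": "longrope",
    "short_factor": [1.0, 1.0, 1.2],
    "long_factor": [1.0, 2.0, 4.0],
    "original_max_position_embeddings": 4096,
    "factor": 8.0
})");
```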

csrc/config/model_config.hpp

Lines changed: 71 additions & 0 deletions
```cpp
#pragma once

#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "quant_config.hpp"
#include <fstream>
#include <string>

namespace infinilm::config {
class ModelConfig {
    // Model config is implemented using nlohmann/json and is primarily used for advanced configuration
    // beyond the standard model config. It is initialized via ModelConfig(const std::string& path)
    // and passed through the InferEngine during inference.
public:
    ModelConfig() = default;
    // Not Implemented
    // ModelConfig(const nlohmann::json &json) : config_json(json) {};
    ModelConfig(const std::string &path);

    // Template function to get a value by key with type safety
    template <typename T>
    T get(const std::string &key) const {
        if (!config_json.contains(key)) {
            throw std::out_of_range("Key '" + key + "' not found in config.");
        }
        try {
            return config_json.at(key).get<T>();
        } catch (const nlohmann::json::type_error &e) {
            throw std::runtime_error("Type conversion failed for key '" + key + "': " + std::string(e.what()));
        }
    }

    template <typename T>
    T get_or(const std::string &key, const T &default_value) const {
        if (!config_json.contains(key) || config_json.at(key).is_null()) {
            return default_value;
        }
        try {
            return config_json.at(key).get<T>();
        } catch (const nlohmann::json::type_error &) {
            // If type conversion fails, return default value
            return default_value;
        }
    }

    size_t get_kv_dim() const {
        return get<size_t>("hidden_size") * get<size_t>("num_key_value_heads") / get<size_t>("num_attention_heads");
    }

    size_t get_head_dim() const {
        if (config_json.contains("head_dim")) {
            return get<size_t>("head_dim");
        }
        return get<size_t>("hidden_size") / get<size_t>("num_attention_heads");
    }

    QuantConfig get_quant_config() const {
        return quant_config;
    }

    std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const {
        return quant_config.get_quantization_method();
    }

    infinicore::DataType get_dtype() const;
    infinicore::quantization::QuantScheme get_quant_scheme() const;
    std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;

private:
    nlohmann::json config_json;
    QuantConfig quant_config;
};
} // namespace infinilm::config
```
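A hypothetical usage sketch of the typed accessors; the key names follow common Hugging Face config.json conventions and are not taken from this commit:

```cpp
#include "model_config.hpp"

void example() {
    infinilm::config::ModelConfig cfg("/path/to/model_dir/config.json");

    // get<T> throws std::out_of_range if the key is missing;
    // get_or<T> falls back to the default on a missing or null key.
    auto n_layers = cfg.get<size_t>("num_hidden_layers");
    auto rms_eps = cfg.get_or<float>("rms_norm_eps", 1e-6f);

    // head_dim falls back to hidden_size / num_attention_heads when absent.
    auto head_dim = cfg.get_head_dim();
    auto kv_dim = cfg.get_kv_dim();

    // Maps the "torch_dtype" string to infinicore::DataType.
    auto dtype = cfg.get_dtype();
    (void)n_layers; (void)rms_eps; (void)head_dim; (void)kv_dim; (void)dtype;
}
```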

csrc/config/quant_config.cpp

Lines changed: 27 additions & 0 deletions
```cpp
#include "quant_config.hpp"

namespace infinilm::config {
QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) {
    this->quantization_method = get_quantization_method();
}

std::shared_ptr<infinicore::quantization::BaseQuantization>
QuantConfig::get_quantization_method() const {
    if (quantization_config.is_null()) {
        return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config); // Default case if no matching scheme
    }

    // Determine the quantization scheme from the JSON config.
    // Add other schemes as needed.
    if (quantization_config["quant_method"] == "compressed-tensors") {
        return std::make_shared<infinicore::quantization::CompressedTensors>(quantization_config);
    } else if (quantization_config["quant_method"] == "awq") {
        return std::make_shared<infinicore::quantization::AWQ>(quantization_config);
    } else {
        return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config);
    }
}
} // namespace infinilm::config
```
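For orientation, an illustrative "quantization_config" input; the field values are hypothetical, and only quant_method drives the dispatch above:

```cpp
#include <nlohmann/json.hpp>

// "compressed-tensors" selects CompressedTensors, "awq" selects AWQ, and
// anything else (including null) falls back to NoneQuantization.
const auto quant_cfg_example = nlohmann::json::parse(R"({
    "quant_method": "awq",
    "bits": 4,
    "group_size": 128
})");
```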

csrc/config/quant_config.hpp

Lines changed: 30 additions & 0 deletions
```cpp
#pragma once
// #include "../quantization/quantization.hpp"
#include "infinicore/quantization.hpp"
#include "nlohmann/json.hpp"

namespace infinilm::config {

class QuantConfig {
    // QuantConfig is used to store and parse the "quantization" field from config.json.
    // This is currently a basic version and will be extended in the future.
public:
    QuantConfig() = default;
    QuantConfig(const nlohmann::json &json);

    std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const;

    infinicore::quantization::QuantScheme get_quant_scheme() const {
        if (quantization_method != nullptr) {
            return quantization_method->get_quant_scheme();
        } else {
            return infinicore::quantization::QuantScheme::NONE;
        }
    }

private:
    nlohmann::json quantization_config;
    std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_method;
};

} // namespace infinilm::config
```
general_compiler.cpp

Lines changed: 26 additions & 0 deletions

```cpp
#include "general_compiler.hpp"

namespace infinilm::engine {
GeneralCompiler::GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier) : GraphCompiler(model, barrier) {
    static_batching_compiler_ = std::make_unique<StaticBatchingCompiler>(model_, barrier);
    paged_compiler_ = std::make_unique<PagedCompiler>(model_, barrier);
}

void GeneralCompiler::compile() {
    static_batching_compiler_->compile();
    paged_compiler_->compile();
}

GeneralCompiler::Compiled GeneralCompiler::get_compiled(const InfinilmModel::Input &input) {
    GeneralCompiler::Compiled result = {nullptr, nullptr};

    // Try each compiler; return the first valid result.
    result = static_batching_compiler_->get_compiled(input);
    if (std::get<0>(result) != nullptr && std::get<1>(result) != nullptr) {
        return result;
    }
    result = paged_compiler_->get_compiled(input);
    return result;
}

} // namespace infinilm::engine
```
general_compiler.hpp

Lines changed: 19 additions & 0 deletions

```cpp
#pragma once

#include "paged_compiler.hpp"
#include "static_batching_compiler.hpp"

namespace infinilm::engine {
class GeneralCompiler : public GraphCompiler {
public:
    GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier);

    void compile() override;

    Compiled get_compiled(const InfinilmModel::Input &input) override;

private:
    std::unique_ptr<StaticBatchingCompiler> static_batching_compiler_;
    std::unique_ptr<PagedCompiler> paged_compiler_;
};
} // namespace infinilm::engine
```
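GeneralCompiler compiles both graph flavors up front and returns whichever sub-compiler yields a complete (non-null) pair for a given input. A hypothetical call site, with the surrounding types assumed from headers not shown in this diff:

```cpp
#include "general_compiler.hpp"
#include <tuple>

namespace infinilm::engine {
// Sketch only: InfinilmModel, RankBarrier, and Compiled's element types are
// assumed from the declarations above, not defined in this commit's excerpt.
void run_example(const std::shared_ptr<InfinilmModel> &model,
                 RankBarrier *barrier,
                 const InfinilmModel::Input &input) {
    GeneralCompiler compiler(model, barrier);
    compiler.compile(); // builds both the static-batching and paged graphs

    auto compiled = compiler.get_compiled(input);
    if (std::get<0>(compiled) != nullptr && std::get<1>(compiled) != nullptr) {
        // dispatch on whichever graph matched this input's batching mode
    }
}
} // namespace infinilm::engine
```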
