From ec4838814fe7432cb5957b3e4f3fb33b9228bbd0 Mon Sep 17 00:00:00 2001 From: hongbinl Date: Fri, 5 Jun 2026 00:16:35 -0700 Subject: [PATCH 1/4] Make NVTE tensor handle pool size configurable Signed-off-by: hongbinl --- docs/envvars.rst | 19 ++++++++ .../common/transformer_engine.cpp | 47 ++++++++++++++++--- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/docs/envvars.rst b/docs/envvars.rst index bd62ccac46..3ead99bbd2 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -119,6 +119,25 @@ Runtime Environment Variables These environment variables control the behavior of Transformer Engine during execution. +General +^^^^^^^ + +.. envvar:: NVTE_TENSOR_HANDLE_POOL_SIZE_MB + + :Type: ``int`` (positive integer) + :Default: ``20`` + :Description: Size in MiB of the internal ``NVTETensor`` handle pool. Increase this + value if an application legitimately creates more tensor handles than + the default pool can hold. + +.. envvar:: NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB + + :Type: ``int`` (positive integer) + :Default: ``20`` + :Description: Size in MiB of the internal ``NVTEGroupedTensor`` handle pool. Increase + this value if an application legitimately creates more grouped tensor + handles than the default pool can hold. + Attention Backend Selection ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index b3179d38fd..d84edb59da 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include "common.h" #include "common/util/cuda_runtime.h" #include "common/util/logging.h" +#include "common/util/system.h" namespace transformer_engine { @@ -393,6 +395,30 @@ void CheckOutputGroupedTensor(const GroupedTensor &t, std::string_view name, boo CheckGroupedTensorShapeArrays(t, name); } +namespace { + +constexpr size_t kDefaultTensorHandlePoolSizeMB = 20; +constexpr size_t kBytesPerMB = 1024 * 1024; + +size_t GetTensorHandlePoolSizeMB(const char *env_var) { + const size_t pool_size_mb = getenv(env_var, kDefaultTensorHandlePoolSizeMB); + NVTE_CHECK(pool_size_mb > 0, env_var, " must be a positive integer."); + NVTE_CHECK(pool_size_mb <= std::numeric_limits::max() / kBytesPerMB, env_var, + " is too large."); + return pool_size_mb; +} + +size_t GetTensorHandlePoolCapacity(size_t pool_size_mb, size_t handle_size, + const char *handle_name, const char *env_var) { + const size_t pool_size_bytes = pool_size_mb * kBytesPerMB; + NVTE_CHECK(pool_size_bytes >= handle_size, env_var, "=", pool_size_mb, + " MiB is too small for one ", handle_name, " handle of size ", handle_size, + " bytes."); + return pool_size_bytes / handle_size; +} + +} // namespace + class TensorAllocator { public: static TensorAllocator &instance() { @@ -407,7 +433,9 @@ class TensorAllocator { const size_t available = free_list.size() + (memory.capacity() - memory.size()); NVTE_CHECK(available >= N, "Cannot allocate ", N, " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, - ". There is probably a memory leak in your application."); + " (", TENSOR_HANDLE_POOL_SIZE_MB, + " MiB handle pool). If your application legitimately needs more tensor handles, " + "increase NVTE_TENSOR_HANDLE_POOL_SIZE_MB."); for (size_t i = 0; i < N; ++i) { uintptr_t index; if (!free_list.empty()) { @@ -479,9 +507,11 @@ class TensorAllocator { std::mutex mutex; std::atomic size; - // Allocate at most 20 MB for tensors // Should be replaced by virtual memory allocation - const size_t MAX_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(Tensor); + const size_t TENSOR_HANDLE_POOL_SIZE_MB = + GetTensorHandlePoolSizeMB("NVTE_TENSOR_HANDLE_POOL_SIZE_MB"); + const size_t MAX_TENSOR_NUM = GetTensorHandlePoolCapacity( + TENSOR_HANDLE_POOL_SIZE_MB, sizeof(Tensor), "NVTETensor", "NVTE_TENSOR_HANDLE_POOL_SIZE_MB"); std::vector free_list; std::vector memory; bool debug = false; @@ -532,7 +562,9 @@ class GroupedTensorAllocator { } NVTE_ERROR( "Cannot allocate a new NVTEGroupedTensor. Maximum number of grouped tensors reached: ", - MAX_GROUPED_TENSOR_NUM, ". There is probably a memory leak in your application."); + MAX_GROUPED_TENSOR_NUM, " (", GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, + " MiB handle pool). If your application legitimately needs more grouped tensor handles, " + "increase NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB."); } void Free(NVTEGroupedTensor t) { @@ -564,8 +596,11 @@ class GroupedTensorAllocator { std::mutex mutex; std::atomic size; - // Allocate at most 20 MB for grouped tensors - const size_t MAX_GROUPED_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(GroupedTensor); + const size_t GROUPED_TENSOR_HANDLE_POOL_SIZE_MB = + GetTensorHandlePoolSizeMB("NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); + const size_t MAX_GROUPED_TENSOR_NUM = GetTensorHandlePoolCapacity( + GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, sizeof(GroupedTensor), "NVTEGroupedTensor", + "NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); std::vector free_list; std::vector memory; }; From c3a32854ceb55692df215d38f07ba1cb7e5af42c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 07:19:47 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/transformer_engine.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index d84edb59da..331651cd8d 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -408,12 +408,11 @@ size_t GetTensorHandlePoolSizeMB(const char *env_var) { return pool_size_mb; } -size_t GetTensorHandlePoolCapacity(size_t pool_size_mb, size_t handle_size, - const char *handle_name, const char *env_var) { +size_t GetTensorHandlePoolCapacity(size_t pool_size_mb, size_t handle_size, const char *handle_name, + const char *env_var) { const size_t pool_size_bytes = pool_size_mb * kBytesPerMB; NVTE_CHECK(pool_size_bytes >= handle_size, env_var, "=", pool_size_mb, - " MiB is too small for one ", handle_name, " handle of size ", handle_size, - " bytes."); + " MiB is too small for one ", handle_name, " handle of size ", handle_size, " bytes."); return pool_size_bytes / handle_size; } @@ -432,8 +431,8 @@ class TensorAllocator { std::lock_guard lock(mutex); const size_t available = free_list.size() + (memory.capacity() - memory.size()); NVTE_CHECK(available >= N, "Cannot allocate ", N, - " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, - " (", TENSOR_HANDLE_POOL_SIZE_MB, + " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, " (", + TENSOR_HANDLE_POOL_SIZE_MB, " MiB handle pool). If your application legitimately needs more tensor handles, " "increase NVTE_TENSOR_HANDLE_POOL_SIZE_MB."); for (size_t i = 0; i < N; ++i) { @@ -598,9 +597,9 @@ class GroupedTensorAllocator { std::atomic size; const size_t GROUPED_TENSOR_HANDLE_POOL_SIZE_MB = GetTensorHandlePoolSizeMB("NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); - const size_t MAX_GROUPED_TENSOR_NUM = GetTensorHandlePoolCapacity( - GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, sizeof(GroupedTensor), "NVTEGroupedTensor", - "NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); + const size_t MAX_GROUPED_TENSOR_NUM = + GetTensorHandlePoolCapacity(GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, sizeof(GroupedTensor), + "NVTEGroupedTensor", "NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); std::vector free_list; std::vector memory; }; From 5c9ee293b44662576c9442b4296d61d9f96a7361 Mon Sep 17 00:00:00 2001 From: hongbinl Date: Mon, 8 Jun 2026 08:22:16 -0700 Subject: [PATCH 3/4] Validate tensor handle pool env vars Signed-off-by: hongbinl --- .../common/transformer_engine.cpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index 331651cd8d..0fafc7b281 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include "common.h" #include "common/util/cuda_runtime.h" #include "common/util/logging.h" -#include "common/util/system.h" namespace transformer_engine { @@ -401,7 +401,27 @@ constexpr size_t kDefaultTensorHandlePoolSizeMB = 20; constexpr size_t kBytesPerMB = 1024 * 1024; size_t GetTensorHandlePoolSizeMB(const char *env_var) { - const size_t pool_size_mb = getenv(env_var, kDefaultTensorHandlePoolSizeMB); + const char *env_value = std::getenv(env_var); + if (env_value == nullptr || env_value[0] == '\0') { + return kDefaultTensorHandlePoolSizeMB; + } + + const std::string value(env_value); + constexpr const char *kWhitespace = " \t\n\r\f\v"; + const size_t first = value.find_first_not_of(kWhitespace); + const size_t last = value.find_last_not_of(kWhitespace); + NVTE_CHECK(first != std::string::npos, env_var, " must be a positive integer."); + + size_t pool_size_mb = 0; + for (size_t i = first; i <= last; ++i) { + NVTE_CHECK(value[i] >= '0' && value[i] <= '9', env_var, + " must be a positive integer, got \"", value, "\"."); + const size_t digit = static_cast(value[i] - '0'); + NVTE_CHECK(pool_size_mb <= (std::numeric_limits::max() - digit) / 10, env_var, + " is too large."); + pool_size_mb = pool_size_mb * 10 + digit; + } + NVTE_CHECK(pool_size_mb > 0, env_var, " must be a positive integer."); NVTE_CHECK(pool_size_mb <= std::numeric_limits::max() / kBytesPerMB, env_var, " is too large."); From 935054d3d2f3baef740d3eeb9773d2f7a25f7326 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:23:29 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/transformer_engine.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index 0fafc7b281..066ca1dc8c 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -414,8 +414,8 @@ size_t GetTensorHandlePoolSizeMB(const char *env_var) { size_t pool_size_mb = 0; for (size_t i = first; i <= last; ++i) { - NVTE_CHECK(value[i] >= '0' && value[i] <= '9', env_var, - " must be a positive integer, got \"", value, "\"."); + NVTE_CHECK(value[i] >= '0' && value[i] <= '9', env_var, " must be a positive integer, got \"", + value, "\"."); const size_t digit = static_cast(value[i] - '0'); NVTE_CHECK(pool_size_mb <= (std::numeric_limits::max() - digit) / 10, env_var, " is too large.");