NVIDIA · lhb8125 · Jun 5, 2026 · Jun 5, 2026 · greptile-apps · Jun 5, 2026
diff --git a/docs/envvars.rst b/docs/envvars.rst
@@ -119,6 +119,25 @@ Runtime Environment Variables
 
 These environment variables control the behavior of Transformer Engine during execution.
 
+General
+^^^^^^^
+
+.. envvar:: NVTE_TENSOR_HANDLE_POOL_SIZE_MB
+
+   :Type: ``int`` (positive integer)
+   :Default: ``20``
+   :Description: Size in MiB of the internal ``NVTETensor`` handle pool. Increase this
+                 value if an application legitimately creates more tensor handles than
+                 the default pool can hold.
+
+.. envvar:: NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB
+
+   :Type: ``int`` (positive integer)
+   :Default: ``20``
+   :Description: Size in MiB of the internal ``NVTEGroupedTensor`` handle pool. Increase
+                 this value if an application legitimately creates more grouped tensor
+                 handles than the default pool can hold.
+
 Attention Backend Selection
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp
@@ -12,6 +12,7 @@
 #include <climits>
 #include <cstring>
 #include <iostream>
+#include <limits>
 #include <mutex>
 #include <optional>
 #include <string>
@@ -21,6 +22,7 @@
 #include "common.h"
 #include "common/util/cuda_runtime.h"
 #include "common/util/logging.h"
+#include "common/util/system.h"
 
 namespace transformer_engine {
 
@@ -393,6 +395,29 @@ void CheckOutputGroupedTensor(const GroupedTensor &t, std::string_view name, boo
   CheckGroupedTensorShapeArrays(t, name);
 }
 
+namespace {
+
+constexpr size_t kDefaultTensorHandlePoolSizeMB = 20;
+constexpr size_t kBytesPerMB = 1024 * 1024;
+
+size_t GetTensorHandlePoolSizeMB(const char *env_var) {
+  const size_t pool_size_mb = getenv<size_t>(env_var, kDefaultTensorHandlePoolSizeMB);
+  NVTE_CHECK(pool_size_mb > 0, env_var, " must be a positive integer.");
+  NVTE_CHECK(pool_size_mb <= std::numeric_limits<size_t>::max() / kBytesPerMB, env_var,
+             " is too large.");
+  return pool_size_mb;
+}
+
+size_t GetTensorHandlePoolCapacity(size_t pool_size_mb, size_t handle_size, const char *handle_name,
+                                   const char *env_var) {
+  const size_t pool_size_bytes = pool_size_mb * kBytesPerMB;
+  NVTE_CHECK(pool_size_bytes >= handle_size, env_var, "=", pool_size_mb,
+             " MiB is too small for one ", handle_name, " handle of size ", handle_size, " bytes.");
+  return pool_size_bytes / handle_size;
+}
+
+}  // namespace
+
 class TensorAllocator {
  public:
   static TensorAllocator &instance() {
@@ -406,8 +431,10 @@ class TensorAllocator {
     std::lock_guard<std::mutex> lock(mutex);
     const size_t available = free_list.size() + (memory.capacity() - memory.size());
     NVTE_CHECK(available >= N, "Cannot allocate ", N,
-               " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM,
-               ". There is probably a memory leak in your application.");
+               " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, " (",
+               TENSOR_HANDLE_POOL_SIZE_MB,
+               " MiB handle pool). If your application legitimately needs more tensor handles, "
+               "increase NVTE_TENSOR_HANDLE_POOL_SIZE_MB.");
     for (size_t i = 0; i < N; ++i) {
       uintptr_t index;
       if (!free_list.empty()) {
@@ -479,9 +506,11 @@ class TensorAllocator {
 
   std::mutex mutex;
   std::atomic<size_t> size;
-  // Allocate at most 20 MB for tensors
   // Should be replaced by virtual memory allocation
-  const size_t MAX_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(Tensor);
+  const size_t TENSOR_HANDLE_POOL_SIZE_MB =
+      GetTensorHandlePoolSizeMB("NVTE_TENSOR_HANDLE_POOL_SIZE_MB");
+  const size_t MAX_TENSOR_NUM = GetTensorHandlePoolCapacity(
+      TENSOR_HANDLE_POOL_SIZE_MB, sizeof(Tensor), "NVTETensor", "NVTE_TENSOR_HANDLE_POOL_SIZE_MB");
   std::vector<uintptr_t> free_list;
   std::vector<Tensor> memory;
   bool debug = false;
@@ -532,7 +561,9 @@ class GroupedTensorAllocator {
     }
     NVTE_ERROR(
         "Cannot allocate a new NVTEGroupedTensor. Maximum number of grouped tensors reached: ",
-        MAX_GROUPED_TENSOR_NUM, ". There is probably a memory leak in your application.");
+        MAX_GROUPED_TENSOR_NUM, " (", GROUPED_TENSOR_HANDLE_POOL_SIZE_MB,
+        " MiB handle pool). If your application legitimately needs more grouped tensor handles, "
+        "increase NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB.");
   }
 
   void Free(NVTEGroupedTensor t) {
@@ -564,8 +595,11 @@ class GroupedTensorAllocator {
 
   std::mutex mutex;
   std::atomic<size_t> size;
-  // Allocate at most 20 MB for grouped tensors
-  const size_t MAX_GROUPED_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(GroupedTensor);
+  const size_t GROUPED_TENSOR_HANDLE_POOL_SIZE_MB =
+      GetTensorHandlePoolSizeMB("NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB");
+  const size_t MAX_GROUPED_TENSOR_NUM =
+      GetTensorHandlePoolCapacity(GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, sizeof(GroupedTensor),
+                                  "NVTEGroupedTensor", "NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB");
   std::vector<uintptr_t> free_list;
   std::vector<GroupedTensor> memory;
 };