diff --git a/docs/envvars.rst b/docs/envvars.rst index bd62ccac46..3ead99bbd2 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -119,6 +119,25 @@ Runtime Environment Variables These environment variables control the behavior of Transformer Engine during execution. +General +^^^^^^^ + +.. envvar:: NVTE_TENSOR_HANDLE_POOL_SIZE_MB + + :Type: ``int`` (positive integer) + :Default: ``20`` + :Description: Size in MiB of the internal ``NVTETensor`` handle pool. Increase this + value if an application legitimately creates more tensor handles than + the default pool can hold. + +.. envvar:: NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB + + :Type: ``int`` (positive integer) + :Default: ``20`` + :Description: Size in MiB of the internal ``NVTEGroupedTensor`` handle pool. Increase + this value if an application legitimately creates more grouped tensor + handles than the default pool can hold. + Attention Backend Selection ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index b3179d38fd..331651cd8d 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include "common.h" #include "common/util/cuda_runtime.h" #include "common/util/logging.h" +#include "common/util/system.h" namespace transformer_engine { @@ -393,6 +395,29 @@ void CheckOutputGroupedTensor(const GroupedTensor &t, std::string_view name, boo CheckGroupedTensorShapeArrays(t, name); } +namespace { + +constexpr size_t kDefaultTensorHandlePoolSizeMB = 20; +constexpr size_t kBytesPerMB = 1024 * 1024; + +size_t GetTensorHandlePoolSizeMB(const char *env_var) { + const size_t pool_size_mb = getenv(env_var, kDefaultTensorHandlePoolSizeMB); + NVTE_CHECK(pool_size_mb > 0, env_var, " must be a positive integer."); + NVTE_CHECK(pool_size_mb <= std::numeric_limits::max() / kBytesPerMB, env_var, + " is too large."); + return pool_size_mb; +} + +size_t GetTensorHandlePoolCapacity(size_t pool_size_mb, size_t handle_size, const char *handle_name, + const char *env_var) { + const size_t pool_size_bytes = pool_size_mb * kBytesPerMB; + NVTE_CHECK(pool_size_bytes >= handle_size, env_var, "=", pool_size_mb, + " MiB is too small for one ", handle_name, " handle of size ", handle_size, " bytes."); + return pool_size_bytes / handle_size; +} + +} // namespace + class TensorAllocator { public: static TensorAllocator &instance() { @@ -406,8 +431,10 @@ class TensorAllocator { std::lock_guard lock(mutex); const size_t available = free_list.size() + (memory.capacity() - memory.size()); NVTE_CHECK(available >= N, "Cannot allocate ", N, - " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, - ". There is probably a memory leak in your application."); + " new NVTETensors. Maximum number of tensors reached: ", MAX_TENSOR_NUM, " (", + TENSOR_HANDLE_POOL_SIZE_MB, + " MiB handle pool). If your application legitimately needs more tensor handles, " + "increase NVTE_TENSOR_HANDLE_POOL_SIZE_MB."); for (size_t i = 0; i < N; ++i) { uintptr_t index; if (!free_list.empty()) { @@ -479,9 +506,11 @@ class TensorAllocator { std::mutex mutex; std::atomic size; - // Allocate at most 20 MB for tensors // Should be replaced by virtual memory allocation - const size_t MAX_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(Tensor); + const size_t TENSOR_HANDLE_POOL_SIZE_MB = + GetTensorHandlePoolSizeMB("NVTE_TENSOR_HANDLE_POOL_SIZE_MB"); + const size_t MAX_TENSOR_NUM = GetTensorHandlePoolCapacity( + TENSOR_HANDLE_POOL_SIZE_MB, sizeof(Tensor), "NVTETensor", "NVTE_TENSOR_HANDLE_POOL_SIZE_MB"); std::vector free_list; std::vector memory; bool debug = false; @@ -532,7 +561,9 @@ class GroupedTensorAllocator { } NVTE_ERROR( "Cannot allocate a new NVTEGroupedTensor. Maximum number of grouped tensors reached: ", - MAX_GROUPED_TENSOR_NUM, ". There is probably a memory leak in your application."); + MAX_GROUPED_TENSOR_NUM, " (", GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, + " MiB handle pool). If your application legitimately needs more grouped tensor handles, " + "increase NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB."); } void Free(NVTEGroupedTensor t) { @@ -564,8 +595,11 @@ class GroupedTensorAllocator { std::mutex mutex; std::atomic size; - // Allocate at most 20 MB for grouped tensors - const size_t MAX_GROUPED_TENSOR_NUM = 20 * 1024 * 1024 / sizeof(GroupedTensor); + const size_t GROUPED_TENSOR_HANDLE_POOL_SIZE_MB = + GetTensorHandlePoolSizeMB("NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); + const size_t MAX_GROUPED_TENSOR_NUM = + GetTensorHandlePoolCapacity(GROUPED_TENSOR_HANDLE_POOL_SIZE_MB, sizeof(GroupedTensor), + "NVTEGroupedTensor", "NVTE_GROUPED_TENSOR_HANDLE_POOL_SIZE_MB"); std::vector free_list; std::vector memory; };