102 changes: 102 additions & 0 deletions backends/aoti/slim/CMakeLists.txt
@@ -0,0 +1,102 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

# SlimTensor library for the ExecuTorch CUDA backend: a lightweight tensor
# implementation for AOTI (Ahead-of-Time Inference).

# C10 headers, grouped by source directory
set(SLIM_C10_CORE_HEADERS c10/core/Device.h c10/core/DeviceType.h
    c10/core/Contiguity.h
)
set(SLIM_C10_HEADERS c10/MemoryFormat.h c10/SizesAndStrides.h
    c10/WrapDimMinimal.h
)

# Utility headers
set(SLIM_UTIL_HEADERS util/SharedPtr.h util/SizeUtil.h util/type_convert.h)

# Core SlimTensor headers
set(SLIM_CORE_HEADERS core/SlimTensor.h core/SlimTensorResize-incl.h
    core/SlimTensorView-incl.h core/Storage.h
)

# Factory headers
set(SLIM_FACTORY_HEADERS factory/Empty.h factory/Factory.h factory/FromBlob.h
    factory/FromScalar.h factory/Pad.h
)

# CUDA headers
set(SLIM_CUDA_HEADERS cuda/Exception.h cuda/Guard.h)

# All headers combined
set(SLIM_TENSOR_HEADERS
    ${SLIM_C10_CORE_HEADERS} ${SLIM_C10_HEADERS} ${SLIM_UTIL_HEADERS}
    ${SLIM_CORE_HEADERS} ${SLIM_FACTORY_HEADERS} ${SLIM_CUDA_HEADERS}
)

# Header-only interface library for SlimTensor. The include root is the
# parent of the repository checkout, so consumers include headers as
# <executorch/backends/aoti/slim/...> (assuming the checkout directory is
# named executorch).
add_library(slim_tensor INTERFACE)
target_include_directories(
  slim_tensor INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/../../../..
)

# Link to ExecuTorch dependencies
target_link_libraries(
  slim_tensor INTERFACE executorch_core extension_data_loader
)

# CUDA support (if available)
if(EXECUTORCH_BUILD_CUDA)
  find_package(CUDAToolkit REQUIRED)
  target_link_libraries(slim_tensor INTERFACE CUDA::cudart)
endif()

# Installation: each header group is installed into the directory matching
# its source location.
install(FILES ${SLIM_C10_CORE_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/c10/core
)
install(FILES ${SLIM_C10_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/c10
)
install(FILES ${SLIM_UTIL_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/util
)
install(FILES ${SLIM_CORE_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/core
)
install(FILES ${SLIM_FACTORY_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/factory
)
install(FILES ${SLIM_CUDA_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/cuda
)

# Tests (if building tests)
if(EXECUTORCH_BUILD_TESTS)
  enable_testing()

  # Basic SlimTensor tests
  add_executable(test_slim_tensor_basic tests/test_slim_tensor_basic.cpp)
  target_link_libraries(
    test_slim_tensor_basic PRIVATE slim_tensor gtest gtest_main
  )
  add_test(NAME test_slim_tensor_basic COMMAND test_slim_tensor_basic)

  # Type conversion tests
  add_executable(test_type_convert tests/test_type_convert.cpp)
  target_link_libraries(test_type_convert PRIVATE slim_tensor gtest gtest_main)
  add_test(NAME test_type_convert COMMAND test_type_convert)

  # CUDA tests (if CUDA is enabled)
  if(EXECUTORCH_BUILD_CUDA)
    add_executable(test_slim_tensor_cuda tests/test_slim_tensor_cuda.cpp)
    target_link_libraries(
      test_slim_tensor_cuda PRIVATE slim_tensor gtest gtest_main CUDA::cudart
    )
    add_test(NAME test_slim_tensor_cuda COMMAND test_slim_tensor_cuda)
  endif()
endif()
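
Because the interface library exports the parent of the repository checkout as its include root, a consumer target includes these headers by their repo-qualified paths. A minimal sketch, assuming a consumer executable built with add_executable(consumer consumer.cpp) and target_link_libraries(consumer PRIVATE slim_tensor), and a checkout directory named executorch:

// consumer.cpp -- exercises only code visible in this PR.
#include <executorch/backends/aoti/slim/c10/WrapDimMinimal.h>

int main() {
  // maybe_wrap_dim converts a negative dimension index for a rank-4
  // tensor: -1 wraps to 3.
  return c10::maybe_wrap_dim(/*dim=*/-1, /*dim_post_expr=*/4) == 3 ? 0 : 1;
}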
5 changes: 5 additions & 0 deletions backends/aoti/slim/TARGETS
@@ -0,0 +1,5 @@
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
5 changes: 5 additions & 0 deletions backends/aoti/slim/c10/TARGETS
@@ -0,0 +1,5 @@
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
80 changes: 80 additions & 0 deletions backends/aoti/slim/c10/WrapDimMinimal.h
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <c10/macros/Macros.h>
#include <executorch/runtime/platform/assert.h>

#include <cstdint>
#include <utility>

// Unlike the original implementation in c10, we don't need to support
// SymInt here.
namespace c10 {
namespace detail {
template <typename T>
T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar);
}

template <typename T>
T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
  // Inline the fast paths.
  if (C10_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) {
    // Branch explicitly for the negative-dim case (the original c10
    // implementation used explicit control flow here to trigger SymInt
    // guards).
    if (dim < 0) {
      return dim + dim_post_expr;
    }
    return dim;
  }
  // Check edge cases out-of-line (wrapping scalars and out-of-bounds errors).
  return c10::detail::maybe_wrap_dim_slow<T>(
      std::move(dim), std::move(dim_post_expr), wrap_scalar);
}

inline int64_t
maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) {
  return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}

namespace detail {
// Out-of-line slow path for the scalar-wrapping and out-of-bounds cases.
// Unlike the original c10 version, only integral dim types are supported
// (no c10::SymInt), and the template is defined here in the header.
template <typename T>
T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
  ET_CHECK_MSG(
      dim_post_expr >= 0,
      "Rank cannot be negative but got %lld",
      static_cast<long long>(dim_post_expr));

  if (dim_post_expr == 0) {
    ET_CHECK_MSG(
        wrap_scalar,
        "Dimension specified as %lld but tensor has no dimensions",
        static_cast<long long>(dim));
    return c10::maybe_wrap_dim(
        std::move(dim), /*dim_post_expr=*/1, /*wrap_scalar=*/false);
  }

  T min = dim_post_expr * -1;
  T max = dim_post_expr - 1;
  ET_CHECK_MSG(
      min <= dim && dim <= max,
      "Dimension out of range (expected to be in range of [%lld"
      ", %lld], but got %lld)",
      static_cast<long long>(min),
      static_cast<long long>(max),
      static_cast<long long>(dim));

  ET_DCHECK_MSG(
      false, "should never reach here as dim should be out-of-bounds");
  return dim; // unreachable, but needed to suppress compiler warnings
}
} // namespace detail
} // namespace c10
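
A quick behavior sketch for maybe_wrap_dim, derived from the code above (the values are illustrative, not part of the original file):

// maybe_wrap_dim(-1, 4) == 3  -- negative dims wrap: -1 + 4
// maybe_wrap_dim( 2, 4) == 2  -- in-range dims pass through unchanged
// maybe_wrap_dim( 0, 0) == 0  -- a 0-d tensor accepts dims in [-1, 0]
//                                because wrap_scalar defaults to true
// maybe_wrap_dim( 4, 4)       -- fatal ET_CHECK: out of range [-4, 3]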
151 changes: 151 additions & 0 deletions backends/aoti/slim/c10/core/Contiguity.h
@@ -0,0 +1,151 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/backends/aoti/slim/c10/util/ArrayRef.h>
#include <executorch/backends/aoti/slim/c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <vector> // used for the perm buffer in _compute_non_overlapping_and_dense

namespace standalone::c10 {

template <typename T>
bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic.
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

// Returns true if the tensor is definitely contiguous, and false if it is
// not or if contiguity cannot be determined due to unbacked symbols (in
// which case it could be either at runtime). Without SymInt support this
// reduces to the same check as _compute_contiguous.
template <typename T>
bool definitely_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic.
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

template <typename T>
bool _compute_channels_last_contiguous_2d(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  // Please don't merge these cases: the constant initializer list lets the
  // compiler fully unroll the loop for better performance.
  switch (sizes.size()) {
    case 4: {
      T expected = 1;
      for (auto& d : {1, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 3:
      // TODO: the dim == 3 case will be enabled once it is fully tested.
      return false;
    default:
      return false;
  }
}
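
// Worked example (illustrative, not from the original source): a 4-d tensor
// with sizes {N, C, H, W} = {2, 3, 4, 5} stored channels-last (NHWC memory
// order) has strides {H*W*C, 1, W*C, C} = {60, 1, 15, 3}. Visiting dims in
// the order {1, 3, 2, 0} compares strides against 1, C, W*C, and H*W*C in
// turn, which is exactly the layout the loop above accepts.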

template <typename T>
bool _compute_channels_last_contiguous_3d(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  // Please don't merge these cases: the constant initializer list lets the
  // compiler fully unroll the loop for better performance.
  switch (sizes.size()) {
    case 5: {
      T expected = 1;
      for (auto& d : {1, 4, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 4:
      // TODO: the dim == 4 case will be enabled once it is fully tested.
      return false;
    default:
      return false;
  }
}

template <typename T>
bool _compute_non_overlapping_and_dense(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  auto dim = sizes.size();
  if (dim == 1) {
    return sizes[0] < 2 || strides[0] == 1;
  }
  std::vector<int64_t> perm(dim);
  for (const auto i : irange(dim)) {
    perm[i] = i;
  }
  // Sort by strides, leaving 0- and 1-sized dims at the end of the array.
  std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) {
    if (sizes[a] < 2) {
      return false;
    } else if (sizes[b] < 2) {
      return true;
    }
    return strides[a] < strides[b];
  });
  T require_stride = 1;
  for (const auto i : irange(dim)) {
    const auto& size_perm_i = sizes[perm[i]];
    if (size_perm_i < 2) {
      return true;
    }
    if (strides[perm[i]] != require_stride) {
      return false;
    }
    require_stride *= size_perm_i;
  }
  return true;
}

} // namespace standalone::c10
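
A small self-check of the predicates above; a sketch that assumes this slim ArrayRef, like c10::ArrayRef, is implicitly constructible from a std::vector:

// contiguity_example.cpp
#include <executorch/backends/aoti/slim/c10/core/Contiguity.h>

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  using standalone::c10::_compute_contiguous;
  using standalone::c10::_compute_non_overlapping_and_dense;

  // A 2x3x4 row-major tensor has strides {12, 4, 1}: walking dims
  // right-to-left, each stride equals the product of the sizes to its
  // right (1, then 4, then 12), so the tensor is contiguous.
  std::vector<int64_t> sizes{2, 3, 4};
  std::vector<int64_t> strides{12, 4, 1};
  assert(_compute_contiguous<int64_t>(sizes, strides, /*numel=*/24));

  // Transposing a 4x3 row-major tensor gives sizes {3, 4} with strides
  // {1, 3}: no longer contiguous, but still non-overlapping and dense.
  std::vector<int64_t> t_sizes{3, 4};
  std::vector<int64_t> t_strides{1, 3};
  assert(!_compute_contiguous<int64_t>(t_sizes, t_strides, /*numel=*/12));
  assert(_compute_non_overlapping_and_dense<int64_t>(t_sizes, t_strides));
  return 0;
}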