102 changes: 102 additions & 0 deletions backends/aoti/slim/CMakeLists.txt
@@ -0,0 +1,102 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

# SlimTensor library for the ExecuTorch CUDA backend: a lightweight tensor
# implementation for AOTI (Ahead-of-Time Inference).

# C10 headers, grouped by source directory
set(SLIM_C10_CORE_HEADERS c10/core/Device.h c10/core/DeviceType.h
    c10/core/Contiguity.h
)
set(SLIM_C10_HEADERS c10/MemoryFormat.h c10/SizesAndStrides.h
    c10/WrapDimMinimal.h
)

# Utility headers
set(SLIM_UTIL_HEADERS util/SharedPtr.h util/SizeUtil.h util/type_convert.h)

# Core SlimTensor headers
set(SLIM_CORE_HEADERS core/SlimTensor.h core/SlimTensorResize-incl.h
    core/SlimTensorView-incl.h core/Storage.h
)

# Factory headers
set(SLIM_FACTORY_HEADERS factory/Empty.h factory/Factory.h factory/FromBlob.h
    factory/FromScalar.h factory/Pad.h
)

# CUDA headers
set(SLIM_CUDA_HEADERS cuda/Exception.h cuda/Guard.h)

# All headers combined
set(SLIM_TENSOR_HEADERS
    ${SLIM_C10_CORE_HEADERS} ${SLIM_C10_HEADERS} ${SLIM_UTIL_HEADERS}
    ${SLIM_CORE_HEADERS} ${SLIM_FACTORY_HEADERS} ${SLIM_CUDA_HEADERS}
)

# Header-only interface library for SlimTensor. The include root is the
# parent of the repository checkout, so consumers include headers as
# <executorch/backends/aoti/slim/...> (assuming the checkout directory is
# named executorch).
add_library(slim_tensor INTERFACE)
target_include_directories(
  slim_tensor INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/../../../..
)

# Link to ExecuTorch dependencies
target_link_libraries(
  slim_tensor INTERFACE executorch_core extension_data_loader
)

# CUDA support (if available)
if(EXECUTORCH_BUILD_CUDA)
  find_package(CUDAToolkit REQUIRED)
  target_link_libraries(slim_tensor INTERFACE CUDA::cudart)
endif()

# Installation: each header group is installed into the directory matching
# its source location.
install(FILES ${SLIM_C10_CORE_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/c10/core
)
install(FILES ${SLIM_C10_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/c10
)
install(FILES ${SLIM_UTIL_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/util
)
install(FILES ${SLIM_CORE_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/core
)
install(FILES ${SLIM_FACTORY_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/factory
)
install(FILES ${SLIM_CUDA_HEADERS}
        DESTINATION include/executorch/backends/aoti/slim/cuda
)

# Tests (if building tests)
if(EXECUTORCH_BUILD_TESTS)
  enable_testing()

  # Basic SlimTensor tests
  add_executable(test_slim_tensor_basic tests/test_slim_tensor_basic.cpp)
  target_link_libraries(
    test_slim_tensor_basic PRIVATE slim_tensor gtest gtest_main
  )
  add_test(NAME test_slim_tensor_basic COMMAND test_slim_tensor_basic)

  # Type conversion tests
  add_executable(test_type_convert tests/test_type_convert.cpp)
  target_link_libraries(test_type_convert PRIVATE slim_tensor gtest gtest_main)
  add_test(NAME test_type_convert COMMAND test_type_convert)

  # CUDA tests (if CUDA is enabled)
  if(EXECUTORCH_BUILD_CUDA)
    add_executable(test_slim_tensor_cuda tests/test_slim_tensor_cuda.cpp)
    target_link_libraries(
      test_slim_tensor_cuda PRIVATE slim_tensor gtest gtest_main CUDA::cudart
    )
    add_test(NAME test_slim_tensor_cuda COMMAND test_slim_tensor_cuda)
  endif()
endif()
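
Because the interface library exports the parent of the repository checkout as its include root, a consumer target includes these headers by their repo-qualified paths. A minimal sketch, assuming a consumer executable built with add_executable(consumer consumer.cpp) and target_link_libraries(consumer PRIVATE slim_tensor), and a checkout directory named executorch:

// consumer.cpp -- exercises only code visible in this PR.
#include <executorch/backends/aoti/slim/c10/WrapDimMinimal.h>

int main() {
  // maybe_wrap_dim converts a negative dimension index for a rank-4
  // tensor: -1 wraps to 3.
  return c10::maybe_wrap_dim(/*dim=*/-1, /*dim_post_expr=*/4) == 3 ? 0 : 1;
}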
5 changes: 5 additions & 0 deletions backends/aoti/slim/TARGETS
@@ -0,0 +1,5 @@
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
5 changes: 5 additions & 0 deletions backends/aoti/slim/c10/TARGETS
@@ -0,0 +1,5 @@
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
80 changes: 80 additions & 0 deletions backends/aoti/slim/c10/WrapDimMinimal.h
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <c10/macros/Macros.h>
#include <executorch/runtime/platform/assert.h>

#include <cstdint>
#include <utility>

// Unlike the original implementation in c10, we don't need to support
// SymInt here.
namespace c10 {
namespace detail {
template <typename T>
T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar);
}

template <typename T>
T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
  // Inline the fast paths.
  if (C10_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) {
    // Branch explicitly for the negative-dim case (the original c10
    // implementation used explicit control flow here to trigger SymInt
    // guards).
    if (dim < 0) {
      return dim + dim_post_expr;
    }
    return dim;
  }
  // Check edge cases out-of-line (wrapping scalars and out-of-bounds errors).
  return c10::detail::maybe_wrap_dim_slow<T>(
      std::move(dim), std::move(dim_post_expr), wrap_scalar);
}

inline int64_t
maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) {
  return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}

namespace detail {
// Out-of-line slow path for the scalar-wrapping and out-of-bounds cases.
// Unlike the original c10 version, only integral dim types are supported
// (no c10::SymInt), and the template is defined here in the header.
template <typename T>
T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
  ET_CHECK_MSG(
      dim_post_expr >= 0,
      "Rank cannot be negative but got %lld",
      static_cast<long long>(dim_post_expr));

  if (dim_post_expr == 0) {
    ET_CHECK_MSG(
        wrap_scalar,
        "Dimension specified as %lld but tensor has no dimensions",
        static_cast<long long>(dim));
    return c10::maybe_wrap_dim(
        std::move(dim), /*dim_post_expr=*/1, /*wrap_scalar=*/false);
  }

  T min = dim_post_expr * -1;
  T max = dim_post_expr - 1;
  ET_CHECK_MSG(
      min <= dim && dim <= max,
      "Dimension out of range (expected to be in range of [%lld"
      ", %lld], but got %lld)",
      static_cast<long long>(min),
      static_cast<long long>(max),
      static_cast<long long>(dim));

  ET_DCHECK_MSG(
      false, "should never reach here as dim should be out-of-bounds");
  return dim; // unreachable, but needed to suppress compiler warnings
}
} // namespace detail
} // namespace c10
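
A quick behavior sketch for maybe_wrap_dim, derived from the code above (the values are illustrative, not part of the original file):

// maybe_wrap_dim(-1, 4) == 3  -- negative dims wrap: -1 + 4
// maybe_wrap_dim( 2, 4) == 2  -- in-range dims pass through unchanged
// maybe_wrap_dim( 0, 0) == 0  -- a 0-d tensor accepts dims in [-1, 0]
//                                because wrap_scalar defaults to true
// maybe_wrap_dim( 4, 4)       -- fatal ET_CHECK: out of range [-4, 3]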
151 changes: 151 additions & 0 deletions backends/aoti/slim/c10/core/Contiguity.h
@@ -0,0 +1,151 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/backends/aoti/slim/c10/util/ArrayRef.h>
#include <executorch/backends/aoti/slim/c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <vector> // used for the perm buffer in _compute_non_overlapping_and_dense

namespace standalone::c10 {

template <typename T>
bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic.
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

// Returns true if the tensor is definitely contiguous, and false if it is
// not or if contiguity cannot be determined due to unbacked symbols (in
// which case it could be either at runtime). Without SymInt support this
// reduces to the same check as _compute_contiguous.
template <typename T>
bool definitely_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
  if (numel == 0) {
    return true;
  }

  T expected_stride = 1;
  // NB: make sure we do signed arithmetic.
  for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) {
    const auto& size_d = sizes[d];
    if (size_d == 1) {
      continue;
    }

    if (strides[d] != expected_stride) {
      return false;
    }
    expected_stride *= size_d;
  }
  return true;
}

template <typename T>
bool _compute_channels_last_contiguous_2d(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  // Please don't merge these cases: the constant initializer list lets the
  // compiler fully unroll the loop for better performance.
  switch (sizes.size()) {
    case 4: {
      T expected = 1;
      for (auto& d : {1, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 3:
      // TODO: the dim == 3 case will be enabled once it is fully tested.
      return false;
    default:
      return false;
  }
}
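
// Worked example (illustrative, not from the original source): a 4-d tensor
// with sizes {N, C, H, W} = {2, 3, 4, 5} stored channels-last (NHWC memory
// order) has strides {H*W*C, 1, W*C, C} = {60, 1, 15, 3}. Visiting dims in
// the order {1, 3, 2, 0} compares strides against 1, C, W*C, and H*W*C in
// turn, which is exactly the layout the loop above accepts.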

template <typename T>
bool _compute_channels_last_contiguous_3d(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  // Please don't merge these cases: the constant initializer list lets the
  // compiler fully unroll the loop for better performance.
  switch (sizes.size()) {
    case 5: {
      T expected = 1;
      for (auto& d : {1, 4, 3, 2, 0}) {
        const auto& size_d = sizes[d];
        if (size_d != 1) {
          if (strides[d] != expected) {
            return false;
          }
          expected *= size_d;
        }
      }
      return true;
    }
    // NOLINTNEXTLINE(bugprone-branch-clone)
    case 4:
      // TODO: the dim == 4 case will be enabled once it is fully tested.
      return false;
    default:
      return false;
  }
}

template <typename T>
bool _compute_non_overlapping_and_dense(
    ArrayRef<T> sizes,
    ArrayRef<T> strides) {
  auto dim = sizes.size();
  if (dim == 1) {
    return sizes[0] < 2 || strides[0] == 1;
  }
  std::vector<int64_t> perm(dim);
  for (const auto i : irange(dim)) {
    perm[i] = i;
  }
  // Sort by strides, leaving 0- and 1-sized dims at the end of the array.
  std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) {
    if (sizes[a] < 2) {
      return false;
    } else if (sizes[b] < 2) {
      return true;
    }
    return strides[a] < strides[b];
  });
  T require_stride = 1;
  for (const auto i : irange(dim)) {
    const auto& size_perm_i = sizes[perm[i]];
    if (size_perm_i < 2) {
      return true;
    }
    if (strides[perm[i]] != require_stride) {
      return false;
    }
    require_stride *= size_perm_i;
  }
  return true;
}

} // namespace standalone::c10
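
A small self-check of the predicates above; a sketch that assumes this slim ArrayRef, like c10::ArrayRef, is implicitly constructible from a std::vector:

// contiguity_example.cpp
#include <executorch/backends/aoti/slim/c10/core/Contiguity.h>

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  using standalone::c10::_compute_contiguous;
  using standalone::c10::_compute_non_overlapping_and_dense;

  // A 2x3x4 row-major tensor has strides {12, 4, 1}: walking dims
  // right-to-left, each stride equals the product of the sizes to its
  // right (1, then 4, then 12), so the tensor is contiguous.
  std::vector<int64_t> sizes{2, 3, 4};
  std::vector<int64_t> strides{12, 4, 1};
  assert(_compute_contiguous<int64_t>(sizes, strides, /*numel=*/24));

  // Transposing a 4x3 row-major tensor gives sizes {3, 4} with strides
  // {1, 3}: no longer contiguous, but still non-overlapping and dense.
  std::vector<int64_t> t_sizes{3, 4};
  std::vector<int64_t> t_strides{1, 3};
  assert(!_compute_contiguous<int64_t>(t_sizes, t_strides, /*numel=*/12));
  assert(_compute_non_overlapping_and_dense<int64_t>(t_sizes, t_strides));
  return 0;
}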