diff --git a/.clang-format-for-format-sh b/.clang-format-for-format-sh index 17e9f8935d..0da534af8b 100644 --- a/.clang-format-for-format-sh +++ b/.clang-format-for-format-sh @@ -133,7 +133,7 @@ PointerAlignment: Right PPIndentWidth: -1 QualifierAlignment: Right ReferenceAlignment: Pointer -ReflowComments: true +ReflowComments: false ShortNamespaceLines: 1 SortIncludes: CaseSensitive SortJavaStaticImport: Before diff --git a/.editorconfig b/.editorconfig index 71bcacde7d..f339516d70 100644 --- a/.editorconfig +++ b/.editorconfig @@ -4,6 +4,7 @@ root = true [*] end_of_line = lf insert_final_newline = true +trim_trailing_whitespace = true [{CMakeLists.txt,*.cmake}] indent_style = space @@ -24,3 +25,7 @@ indent_size = 2 [*.md] indent_style = space indent_size = 2 + +[*.dox] +indent_style = space +indent_size = 2 diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix deleted file mode 100644 index 361a66c4ff..0000000000 --- a/.flake/pkgs/legion.nix +++ /dev/null @@ -1,48 +0,0 @@ -{ lib -, stdenv -, fetchFromGitLab -, cmake -, cudaPackages ? { } -, cudaCapabilities ? [ "60" "70" "80" "86" ] -, maxDim ? 
5 -}: - -# from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix - -let - cmakeFlag = x: if x then "1" else "0"; - - inherit (cudaPackages) cudatoolkit; -in - -stdenv.mkDerivation rec { - pname = "legion"; - version = "2025-01-06"; - - src = fetchFromGitLab { - owner = "StanfordLegion"; - repo = "legion"; - rev = "7be1abd0207eb1126c7629b16d1123fa6f58ce9d"; - sha256 = "sha256-gTjnGYYTQwTsrV1WcY0qqpTrlwbzAPcndurRy6XnG8A="; - }; - - nativeBuildInputs = [ - cmake - ]; - - cmakeFlags = [ - "-DLegion_USE_CUDA=1" - "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}" - "-DLegion_MAX_DIM=${toString maxDim}" - ]; - - buildInputs = [ - cudatoolkit - ]; - - meta = with lib; { - description = "Legion is a parallel programming model for distributed, heterogeneous machines"; - homepage = "https://legion.stanford.edu/"; - license = licenses.asl20; - }; -} diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix new file mode 100644 index 0000000000..336b1c050c --- /dev/null +++ b/.flake/pkgs/realm.nix @@ -0,0 +1,46 @@ +{ lib +, stdenv +, fetchFromGitHub +, cmake +, cudaPackages ? { } +, zlib +, maxDim ? 
5 +}: + +let + inherit (cudaPackages) cudatoolkit; +in + +stdenv.mkDerivation rec { + pname = "realm"; + version = "2026-02-24"; + + src = fetchFromGitHub { + owner = "StanfordLegion"; + repo = "realm"; + rev = "42f7484a80e0bdacaf47d9a758822f5327348dd0"; + sha256 = "sha256-IHiokPmTjEV5df3fr1Xubuyt2N1CFI2fA7Q2TsbxS3Y="; + }; + + nativeBuildInputs = [ + cmake + ]; + + cmakeFlags = [ + "-DBUILD_SHARED_LIBS=ON" + "-DREALM_ENABLE_CUDA=ON" + "-DREALM_ENABLE_PREALM=ON" + "-DREALM_MAX_DIM=${toString maxDim}" + ]; + + buildInputs = [ + cudatoolkit + zlib + ]; + + meta = with lib; { + description = "Realm is a distributed, event–based tasking runtime for building high-performance applications that span clusters of CPUs, GPUs, and other accelerators"; + homepage = "https://legion.stanford.edu/realm"; + license = licenses.asl20; + }; +} diff --git a/.proj.toml b/.proj.toml index 38690f710b..3d78c9ae82 100644 --- a/.proj.toml +++ b/.proj.toml @@ -2,6 +2,7 @@ project_name = "flexflow" testsuite_macro = "FF_TEST_SUITE" namespace_name = "FlexFlow" header_extension = ".h" +doxygen = true cuda_launch_cmd = [ "nixGL", "--", @@ -85,6 +86,13 @@ has-cpu-only-benchmarks = false has-cuda-tests = true has-cuda-benchmarks = false +[targets.realm-execution] +type = "lib" +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false + # [targets.local-pcg-execution] # type = "lib" # has-cpu-only-tests = true diff --git a/CMakeLists.txt b/CMakeLists.txt index fc1a296dbe..4723a3168d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING set(FF_MAX_NUM_TASK_ARGUMENTS "5" CACHE STRING "Maximum number of arguments that can be declared in a TaskSignature") option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) +option(FF_USE_PREALM "Build with PRealm profiling interface" ON) option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF) option(FF_USE_PYTHON 
"Enable Python" ON) option(FF_BUILD_FROM_PYPI "Build from pypi" OFF) diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index ac19f9011e..6855537460 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -7,11 +7,7 @@ if(FF_BUILD_VISUALIZATION_TOOL) endif() if(FF_BUILD_SP_IZATION_BENCHMARKING) - add_subdirectory(sp_ization_benchmarking) -endif() - -if(FF_BUILD_ARG_PARSER) - add_subdirectory(arg_parser) + add_subdirectory(sp-ization-benchmarking) endif() if(FF_BUILD_BIN_EXPORT_MODEL_ARCH) diff --git a/bin/README.md b/bin/README.md deleted file mode 100644 index d0b8ccd018..0000000000 --- a/bin/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# bin - -This directory contains command-line interfaces for FlexFlow Train and associated tools (all in C++). -A short description of each is included below--more information can be found in the `README.md` files -in each of the corresponding directories (e.g., [here](./export-model-arch/README.md) for `export-model-arch`): - -- `export-model-arch`: Exports the model computation graphs defined in the [models](../lib/models/) library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs. -- `substitution-to-dot`: Converts TASO-generated substitutions from the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization. -- `protobuf-to-json`: Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)). 
diff --git a/bin/export-model-arch/README.md b/bin/export-model-arch/README.md deleted file mode 100644 index 80b6c3ef04..0000000000 --- a/bin/export-model-arch/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# export-model-arch - -A tool for exporting and visualizing the model computation graphs defined in [models](../lib/models). -To build and run `export-model-arch`, run the following commands from the root of the FlexFlow Train repository: -```console -$ proj cmake # if you haven't already -... -$ proj build -... -$ ./build/normal/bin/export-model-arch/export-model-arch -h -``` -The above should print the help message for `export-model-arch`. A few example commands are also listed below: - -- Export the `split_test` model in JSON (e.g., for processing outside of FlexFlow Train): -```console -$ ./build/normal/bin/export-model-arch/export-model-arch split_test -``` - -- Export the `split_test` model in JSON along with the SP decomposition of the model's computation graph: -```console -$ ./build/normal/bin/export-model-arch/export-model-arch --sp-decomposition split_test -``` - -- Export the `split_test` model as DOT (e.g., for visualization using a [local](https://github.com/jrfonseca/xdot.py) or [web-based](https://dreampuf.github.io/GraphvizOnline/) DOT viewer) -```console -$ ./build/normal/bin/export-model-arch/export-model-arch --dot split_test -``` diff --git a/bin/export-model-arch/index.dox b/bin/export-model-arch/index.dox new file mode 100644 index 0000000000..349ed2209c --- /dev/null +++ b/bin/export-model-arch/index.dox @@ -0,0 +1,37 @@ +namespace FlexFlow { +/** + +@page export-model-arch export-model-arch + +\brief Exports the model computation graphs defined in the @ref models library as either JSON (for use outside of FlexFlow) or as DOT (for visualization). Can also optionally export the SP decompositions of the computation graphs. 
+ +A tool for exporting (for details of the file format, see \ref file-format) and visualizing the model ComputationGraphs defined in @ref models. +To build and run \c export-model-arch, run the following commands from the root of the %FlexFlow %Train repository: + +\verbatim +$ proj cmake # if you haven't already +... +$ proj build +... +$ ./build/normal/bin/export-model-arch/export-model-arch -h +\endverbatim + +The above should print the help message for `export-model-arch`. A few example commands are also listed below: + +- Export the \ref split-test model in JSON (e.g., for processing outside of %FlexFlow %Train): +\verbatim +$ ./build/normal/bin/export-model-arch/export-model-arch split_test +\endverbatim + +- Export the \ref split-test model in JSON along with the SP decomposition of the model's ComputationGraph: +\verbatim +$ ./build/normal/bin/export-model-arch/export-model-arch --sp-decomposition split_test +\endverbatim + +- Export the \ref split-test model as DOT (e.g., for visualization using a [local](https://github.com/jrfonseca/xdot.py) or [web-based](https://dreampuf.github.io/GraphvizOnline/) DOT viewer) +\verbatim +$ ./build/normal/bin/export-model-arch/export-model-arch --dot split_test +\endverbatim + +*/ +} diff --git a/bin/export-model-arch/src/export-model-arch/main.cc b/bin/export-model-arch/src/export-model-arch/main.cc index 29be28b0ef..e62809dda5 100644 --- a/bin/export-model-arch/src/export-model-arch/main.cc +++ b/bin/export-model-arch/src/export-model-arch/main.cc @@ -118,6 +118,7 @@ tl::expected } int main(int argc, char **argv) { + //! [utils/cli example] CLISpec cli = empty_cli_spec(); CLIArgumentKey arg_key_help = cli_add_help_flag(cli); @@ -182,6 +183,7 @@ int main(int argc, char **argv) { bool sp_decompositition = cli_get_flag(parsed, key_sp_decomposition); bool dot = cli_get_flag(parsed, key_dot); bool preprocessed_dot = cli_get_flag(parsed, key_preprocessed_dot); + //! 
[utils/cli example] auto handle_error = [](auto const &result) { if (!result.has_value()) { diff --git a/bin/index.dox b/bin/index.dox new file mode 100644 index 0000000000..4944e50067 --- /dev/null +++ b/bin/index.dox @@ -0,0 +1,12 @@ +/** + +\page bin bin + +This directory contains command-line interfaces for %FlexFlow %Train and associated tools (all in C++). + +- \subpage export-model-arch "": \copybrief export-model-arch +- \subpage protobuf-to-json "": \copybrief protobuf-to-json +- \subpage sp-ization-benchmarking "": \copybrief sp-ization-benchmarking +- \subpage substitution-to-dot "": \copybrief substitution-to-dot + +*/ diff --git a/bin/protobuf-to-json/README.md b/bin/protobuf-to-json/README.md deleted file mode 100644 index a1b1406e8b..0000000000 --- a/bin/protobuf-to-json/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# protobuf-to-json - -TODO diff --git a/bin/protobuf-to-json/index.dox b/bin/protobuf-to-json/index.dox new file mode 100644 index 0000000000..a49b0fbbd3 --- /dev/null +++ b/bin/protobuf-to-json/index.dox @@ -0,0 +1,12 @@ +namespace FlexFlow { +/** + +\page protobuf-to-json + +\brief Converts TASO-generated substitutions from the legacy protobuf format ([example](../substitutions/graph_subst_3_v2.pb)) to the legacy JSON format ([example](../substitutions/graph_subst_3_v2.json)). Will be removed in the future once the substitution generator is integrated natively into FlexFlow Train (tracked in [#351](https://github.com/flexflow/flexflow-train/issues/351)). + +\todo + \@lockshaw Add docs and example(s) for protobuf-to-json. See \ref export-model-arch for an example. 
+ +*/ +} diff --git a/bin/sp_ization_benchmarking/CMakeLists.txt b/bin/sp-ization-benchmarking/CMakeLists.txt similarity index 100% rename from bin/sp_ization_benchmarking/CMakeLists.txt rename to bin/sp-ization-benchmarking/CMakeLists.txt diff --git a/bin/sp_ization_benchmarking/distributions.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/distributions.h similarity index 100% rename from bin/sp_ization_benchmarking/distributions.h rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/distributions.h diff --git a/bin/sp_ization_benchmarking/nasnet_bench_graph_generator.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h similarity index 100% rename from bin/sp_ization_benchmarking/nasnet_bench_graph_generator.h rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/nasnet_bench_graph_generator.h diff --git a/bin/sp_ization_benchmarking/sample_graphs.h b/bin/sp-ization-benchmarking/include/sp-ization-benchmarking/sample_graphs.h similarity index 100% rename from bin/sp_ization_benchmarking/sample_graphs.h rename to bin/sp-ization-benchmarking/include/sp-ization-benchmarking/sample_graphs.h diff --git a/bin/sp-ization-benchmarking/index.dox b/bin/sp-ization-benchmarking/index.dox new file mode 100644 index 0000000000..9af57a7ff3 --- /dev/null +++ b/bin/sp-ization-benchmarking/index.dox @@ -0,0 +1,12 @@ +namespace FlexFlow { +/** + +\page sp-ization-benchmarking + +\brief Executes evaluations for the various SP-ization algorithms in \ref spization. + +\todo + \@pietro Add usage docs and example(s) for sp-ization-benchmarking. See \ref export-model-arch for an example. 
+ +*/ +} diff --git a/bin/sp_ization_benchmarking/distributions.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/distributions.cc similarity index 100% rename from bin/sp_ization_benchmarking/distributions.cc rename to bin/sp-ization-benchmarking/src/sp-ization-benchmarking/distributions.cc diff --git a/bin/sp_ization_benchmarking/sp_ization_benchmarking.cc b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc similarity index 99% rename from bin/sp_ization_benchmarking/sp_ization_benchmarking.cc rename to bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc index bc98a3a606..933ae535db 100644 --- a/bin/sp_ization_benchmarking/sp_ization_benchmarking.cc +++ b/bin/sp-ization-benchmarking/src/sp-ization-benchmarking/main.cc @@ -1,5 +1,5 @@ /** - * @file sp_ization_benchmarking.cpp + * @file main.cc * @brief Benchmarking different SP-ization techniques on various graphs. * * @details @@ -22,9 +22,9 @@ * run make and then ./sp_ization_benchmarking */ -#include "distributions.h" -#include "nasnet_bench_graph_generator.h" -#include "sample_graphs.h" +#include "sp-ization-benchmarking/distributions.h" +#include "sp-ization-benchmarking/nasnet_bench_graph_generator.h" +#include "sp-ization-benchmarking/sample_graphs.h" #include "utils/graph/digraph/algorithms/transitive_reduction.h" #include "utils/graph/digraph/digraph_view.h" #include "utils/graph/node/algorithms.h" diff --git a/bin/substitution-to-dot/README.md b/bin/substitution-to-dot/README.md deleted file mode 100644 index 931c3cbdd3..0000000000 --- a/bin/substitution-to-dot/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# substitution-to-dot - -TODO diff --git a/bin/substitution-to-dot/index.dox b/bin/substitution-to-dot/index.dox new file mode 100644 index 0000000000..15ee221f93 --- /dev/null +++ b/bin/substitution-to-dot/index.dox @@ -0,0 +1,12 @@ +namespace FlexFlow { +/** + +\page substitution-to-dot + +\brief Converts TASO-generated substitutions from the legacy JSON format 
([example](../substitutions/graph_subst_3_v2.json)) into DOT for visualization. + +\todo + \@lockshaw Add usage docs and example(s) for substitution-to-dot. See \ref export-model-arch for an example. + +*/ +} diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index ef5d6d9d11..795668e32a 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -17,6 +17,7 @@ function(define_ff_vars target) MAX_NUM_FUSED_TENSORS=${FF_MAX_NUM_FUSED_TENSORS} MAX_NUM_WORKERS=${FF_MAX_NUM_WORKERS} FF_USE_NCCL=${FF_USE_NCCL} + FF_USE_PREALM=${FF_USE_PREALM} MAX_TENSOR_DIM=${FF_MAX_DIM} MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS} MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS} diff --git a/contributing.dox b/contributing.dox new file mode 100644 index 0000000000..b86be494c5 --- /dev/null +++ b/contributing.dox @@ -0,0 +1,213 @@ +namespace FlexFlow { +/** + +\page contributing Developers Guide + +\section contributing-setup Setup + +\note If you are developing on Stanford's Sapling cluster, instead see the instructions \subpage sapling-setup "here". If you don't know what this means, you're not using Sapling so you should just continue reading. + +1. %FlexFlow %Train uses Nix to manage dependencies and the development environment. + There exist a number of ways to install Nix, but we recommend one of the following: + + 1. If you have root permissions: [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer) + + 2. If you don't have root permissions: [DavHau/nix-portable](https://github.com/DavHau/nix-portable). + Note that nix-portable does not work particularly well if the Nix store is in NFS or other distributed file systems, + so if you are running on an HPC cluster where the home directory is mounted via a distributed file system we recommend setting the + NP_LOCATION environment variable to /tmp or some other non-NFS location. 
+ + While you should at least skim nix-portable's setup instructions, you'll probably end up doing something like this: + + \verbatim + $ USERBIN="${XDG_BIN_HOME:-$HOME/.local/bin}" + $ wget 'https://github.com/DavHau/nix-portable/releases/download/v010/nix-portable' -O "$USERBIN/nix-portable" + ... + $ chmod u+x "$USERBIN/nix-portable" + ... + $ ln -sf "$USERBIN/nix-portable" "$USERBIN/nix" + ... + $ echo 'export PATH=$USERBIN:$PATH' >> ~/.bashrc + ... + \endverbatim + + Now if everything is set up properly, you should be able to see something like the following (don't worry if the version number is slightly different) if you run nix \--version: + + \verbatim + $ nix --version + nix (Nix) 2.20.6 + \endverbatim + +2. Clone the FlexFlow Train repository + +\verbatim +$ FF_DIR="$HOME/flexflow-train" # or wherever else you want to put the repository +$ git clone --recursive git@github.com:flexflow/flexflow-train.git "$FF_DIR" +... +\endverbatim + +3. Enter the nix-provided `default` development environment (aka "dev shell") + +\verbatim +$ cd "$FF_DIR" +$ nix develop --accept-flake-config +\endverbatim + +4. Build and run the non-GPU-required tests (systems that have access to CUDA GPUs can also run the GPU-mandatory tests by following the instructions \ref contributing-gpu-setup "here") + +\verbatim +(ff) $ proj cmake +... +(ff) $ proj test --skip-gpu-tests +... 
+\endverbatim + +If everything is correctly configured, you should see a bunch of build messages followed by something like + +\verbatim +(ff) $ proj test --skip-gpu-tests +421/421 Test #441: get_transformer_computation_graph +100% tests passed, 0 tests failed out of 421 + +Label Time Summary: +compiler-tests = 6.13 sec*proc (19 tests) +local-execution-tests = 0.13 sec*proc (3 tests) +models-tests = 0.05 sec*proc (4 tests) +op-attrs-tests = 0.48 sec*proc (59 tests) +pcg-tests = 0.33 sec*proc (33 tests) +substitution-generator-tests = 0.06 sec*proc (2 tests) +substitutions-tests = 0.10 sec*proc (9 tests) +utils-tests = 1.20 sec*proc (293 tests) + +Total Test time (real) = 8.64 sec +\endverbatim + +If you don't, or if you see any tests failing, please double check that you have followed the instructions above. +If you have and are still encountering an issue, please \ref contributing-contact-us "contact us" with a detailed description of your platform and the commands you have run. + +\subsection contributing-editorconfig EditorConfig + +FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors. +The EditorConfig file for %FlexFlow %Train can be found in [`.editorconfig`](./.editorconfig). +If you are using vim, emacs, or another editor with built-in EditorConfig support (a full list of editors with built-in EditorConfig support can be found [here](https://editorconfig.org/#pre-installed)) +the configuration will be detected and applied without you needing to do anything. +If you are using an editor not on this list, you will need to install a corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins). +If you are using vscode, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig). 
+ +\subsection contributing-gpu-setup GPU Setup + +If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell: + +\verbatim +$ NIXPKGS_ALLOW_UNFREE=1 nix develop .#gpu --accept-flake-config --impure +\endverbatim + +and then running + +\verbatim +(ff) $ proj test +... +\endverbatim + +You should see the additional GPU tests run. If you instead see a message like + +> `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU` + +Double check that you are correctly in the `gpu` devshell, not the `default` devshell. +If you've confirmed that you are in the correct devshell and are still encountering issues, \ref contributing-contact-us "contact us" with a detailed description of your platform and the commands you have run. + +\subsection contributing-nix-direnv nix-direnv (optional) + +If you installed Nix system-wide (e.g., using [DeterminateSystems/nix-installer](https://github.com/DeterminateSystems/nix-installer)), +you can use [direnv](https://direnv.net/) to automatically enter the %FlexFlow %Train development environment when you `cd` into the repository, rather +than having to manually run `nix develop`. +[direnv](https://direnv.net) will also automatically exit the environment when you `cd` out of the repository, and (if configured using [nix-direnv](https://github.com/nix-community/nix-direnv)) will even automatically reload the environment if the `flake.nix` file changes. +You can find the installation instructions for direnv [here](https://direnv.net/docs/installation.html), and if you would like automatic environment reloading you can also install nix-direnv using the instructions [here](https://github.com/nix-community/nix-direnv?tab=readme-ov-file#installation). + +Once you have direnv (and optionally nix-direnv) installed, cd into the root of your cloned %FlexFlow %Train repository and run + +\verbatim +$ echo 'use flake . 
--accept-flake-config' > .envrc +\endverbatim + +You should see a message that the `.envrc` file you just created is blocked. +Run the command shown in the error message (i.e., `direnv allow`), and direnv should automatically place you in the environment. +For more information on using direnv with nix, see [here](https://github.com/direnv/direnv/wiki/Nix). + +\section contributing-proj Building, Testing, etc. + +Most operations you'll want to perform while developing %FlexFlow %Train are provided through a small python utility called [proj](https://github.com/lockshaw/proj). +`proj` is automatically pulled in by nix when you enter the dev shell, so you should be able to run + +\verbatim +(ff) $ proj -h +\endverbatim + +and see the full list of operations that `proj` supports. +`proj` commands can be run from anywhere in the repository (i.e., they do not have to be run from the root). +To help you get started, however, a list of common command invocations is included here: + +- To build %FlexFlow %Train: + \verbatim + (ff) $ proj build + \endverbatim +- To build and run %FlexFlow %Train tests (without a GPU): + \verbatim + (ff) $ proj test --skip-gpu-tests + \endverbatim +- To build and run %FlexFlow %Train tests (with a GPU): + \verbatim + (ff) $ proj test + \endverbatim +- To regenerate CMake files (necessary anytime you switch branches or modify the CMake source. If you're ever running into weird build issues, try running this and see if it fixes things): + \verbatim + (ff) $ proj cmake + \endverbatim +- To format all of the %FlexFlow %Train sources files: + \verbatim + (ff) $ proj format + \endverbatim +- To build the %FlexFlow %Train docs: + \verbatim + (ff) $ proj doxygen + \endverbatim + You can also add the `--browser` command to automatically open the built docs in your default browser if you are working on your local machine. + +\section contributing-ci Continuous Integration + +We currently implement CI testing using Github Workflows. 
Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows: + +1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see \ref contributing-proj for more information on `proj`). +2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo. + +GPU machines for CI are managed using [runs-on](https://runs-on.com/). + +\section contributing-contributing Contributing to FlexFlow + +We actively welcome your pull requests. Note that we may already be working on the feature/fix you're looking for, so we suggest searching through the [open issues](https://github.com/flexflow/flexflow-train/issues), [open PRs](https://github.com/flexflow/flexflow-train/pulls), and \ref contributing-contact-us "contacting us" to make sure you're not duplicating existing effort! + +The steps for getting changes merged into %FlexFlow are relatively standard: + +1. [Fork the repo](https://github.com/flexflow/flexflow-train/fork) and either create a new branch based on `master`, or just modify `master` directly. +2. If you've added code that should be tested, add tests. The process for adding tests for code under `lib` is documented [here](./lib/README.md#tests). Adding tests for other parts of the code is currently undocumented, so you will need to \ref contributing-contact-us "contact us" for information on how to do it. +3. Ensure the code builds (i.e., run `proj build`). +4. Ensure the test suite passes (i.e., run `proj test`). +5. Format the code (i.e., run `proj format`). +6. 
Create a new PR from your modified branch to the `master` branch in %FlexFlow %Train. + Provide a brief description of the changes you've made and link any related/closed issues. + +Code review is done using [Reviewable](https://reviewable.io/). +If you haven't used Reviewable before, please read through (or at least skim) the ["Reviews" section](https://docs.reviewable.io/reviews.html) of the Reviewable documentation. + +\section contributing-contact-us Contact Us + +Either [create an issue](https://github.com/flexflow/flexflow-train/issues/new) or join the %FlexFlow [Zulip](https://flexflow.zulipchat.com/join/mtiwtwttgggnivrkb6vlakbr/) instance. +For any reported bugs, please ensure that your description is clear and has sufficient information for us to reproduce the issue. + +\section contributing-license License + +By contributing to %FlexFlow %Train, you agree that your contributions will be licensed +under the [LICENSE](./LICENSE) file in the root directory of this source tree. + +*/ +} diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 32b8da3828..933fc234f5 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -42,7 +42,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = FlexFlow +PROJECT_NAME = FlexFlow Train # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -372,7 +372,7 @@ TOC_INCLUDE_HEADINGS = 5 # The default value is: DOXYGEN. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. -MARKDOWN_ID_STYLE = DOXYGEN +MARKDOWN_ID_STYLE = GITHUB # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. 
-AUTOLINK_SUPPORT = YES +AUTOLINK_SUPPORT = NO # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this @@ -390,7 +390,7 @@ AUTOLINK_SUPPORT = YES # diagrams that involve STL classes more complete and accurate. # The default value is: NO. -BUILTIN_STL_SUPPORT = NO +BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. @@ -636,7 +636,7 @@ CASE_SENSE_NAMES = YES # scope will be hidden. # The default value is: NO. -HIDE_SCOPE_NAMES = NO +HIDE_SCOPE_NAMES = YES # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to @@ -944,7 +944,11 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = $(FF_HOME)/lib \ - $(FF_HOME)/bin + $(FF_HOME)/bin \ + $(FF_HOME)/index.dox \ + $(FF_HOME)/contributing.dox \ + $(FF_HOME)/docs/sapling.dox \ + $(FF_HOME)/docs/realm-api.dox # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -990,8 +994,9 @@ FILE_PATTERNS = *.c \ *.cu \ *.h \ *.hpp \ - *.md \ - *.py + *.py \ + *.dox \ + *.md # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. @@ -1006,7 +1011,9 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = lib/realm-execution/include/realm-execution/realm.h \ + lib/runtime/ \ + lib/local-pcg-execution/ # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -1022,7 +1029,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = */tl/* +EXCLUDE_PATTERNS = */tl/* */test/* */hip/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -1036,7 +1043,8 @@ EXCLUDE_SYMBOLS = # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = $(FF_HOME)/lib \ + $(FF_HOME)/bin # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -1050,7 +1058,7 @@ EXAMPLE_PATTERNS = * # irrespective of the value of the RECURSIVE tag. # The default value is: NO. -EXAMPLE_RECURSIVE = NO +EXAMPLE_RECURSIVE = YES # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the @@ -2432,7 +2440,7 @@ HIDE_UNDOC_RELATIONS = YES # set to NO # The default value is: NO. -HAVE_DOT = NO +HAVE_DOT = YES # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed # to run in parallel. When set to 0 doxygen will base this on the number of @@ -2651,7 +2659,7 @@ DOT_IMAGE_FORMAT = png # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. -INTERACTIVE_SVG = NO +INTERACTIVE_SVG = YES # The DOT_PATH tag can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. 
diff --git a/docs/realm-api.dox b/docs/realm-api.dox new file mode 100644 index 0000000000..8fe7aba443 --- /dev/null +++ b/docs/realm-api.dox @@ -0,0 +1,7 @@ +/** + +\page realm-api Realm API Reference + +- \anchor realm-instance Realm::RegionInstance + +*/ diff --git a/docs/SAPLING.md b/docs/sapling.dox similarity index 95% rename from docs/SAPLING.md rename to docs/sapling.dox index ad36c3e9cb..143e09c6f7 100644 --- a/docs/SAPLING.md +++ b/docs/sapling.dox @@ -1,4 +1,6 @@ -# Setup Guide for sapling +/** + +@page sapling-setup Setup Guide for sapling 1. ssh into the sapling head node. @@ -82,8 +84,10 @@ NIXPKGS_ALLOW_UNFREE=1 nix develop .#gpu --accept-flake-config --impure (ff) $ proj test ... ``` -You should see the additional GPU tests run. If you instead see a message like +You should see the additional GPU tests run. If you instead see a message like > `Error: ... Pass --skip-gpu-tests to skip running tests that require a GPU` -Double check that you are correctly in the `gpu` devshell, not the `default` devshell. +Double check that you are correctly in the `gpu` devshell, not the `default` devshell. + +*/ diff --git a/flake.lock b/flake.lock index 9cd1e4bbae..359fdb19a9 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1769666654, - "narHash": "sha256-YFbOVi+Se3KDGFAoofYwYPUpEkEhsvdGdlYDR2I2XmI=", + "lastModified": 1773224815, + "narHash": "sha256-A7JWZNzYwzMZigyqm8IzyiBu82iFznp+oZJzx0eZjmU=", "ref": "refs/heads/master", - "rev": "64620d82f03478496eb00188184dbf48d56b560d", - "revCount": 143, + "rev": "d1db2bac548f66912d22023a3cece241ded1f503", + "revCount": 145, "type": "git", "url": "https://git.sr.ht/~lockshaw/proj" }, diff --git a/flake.nix b/flake.nix index 6ccd5616cd..3e5c477dea 100644 --- a/flake.nix +++ b/flake.nix @@ -30,8 +30,8 @@ }; }; - outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... 
}: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: - let + outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: + let pkgs = import nixpkgs { inherit system; config.allowUnfree = true; @@ -41,21 +41,21 @@ mkShell = attrs: pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; } (attrs // { - hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch - # signed overflows due to the signedoverflow hardening setting. - # for more details, see the following (long-running) nixpkgs github issues: + hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch + # signed overflows due to the signedoverflow hardening setting. + # for more details, see the following (long-running) nixpkgs github issues: # - https://github.com/NixOS/nixpkgs/issues/18995 # - https://github.com/NixOS/nixpkgs/issues/60919 }); proj = proj-repo.packages.${system}.proj; - in + in { packages = rec { libdwarf-lite = pkgs.callPackage ./.flake/pkgs/libdwarf-lite.nix { }; cpptrace = pkgs.callPackage ./.flake/pkgs/cpptrace.nix { inherit libdwarf-lite; }; libassert = pkgs.callPackage ./.flake/pkgs/libassert.nix { inherit cpptrace; }; - legion = pkgs.callPackage ./.flake/pkgs/legion.nix { }; + realm = pkgs.callPackage ./.flake/pkgs/realm.nix { }; bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; @@ -83,8 +83,7 @@ shellHook = '' export PATH="$HOME/ff/.scripts/:$PATH" export RC_PARAMS="max_discard_ratio=100" - export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \ - -DFF_USE_EXTERNAL_NCCL=ON \ + export CMAKE_FLAGS="-DFF_USE_EXTERNAL_NCCL=ON \ -DFF_USE_EXTERNAL_JSON=ON \ -DFF_USE_EXTERNAL_FMT=ON \ -DFF_USE_EXTERNAL_SPDLOG=ON \ @@ -94,7 +93,7 @@ -DFF_USE_EXTERNAL_GBENCHMARK=ON \ 
-DFF_USE_EXTERNAL_LIBASSERT=ON" ''; - + buildInputs = builtins.concatLists [ (with pkgs; [ zlib @@ -119,13 +118,14 @@ compdb gbenchmark libtorch-bin + graphviz # for documentation ]) (with proj-repo.packages.${system}; [ proj ]) (with self.packages.${system}; [ libassert - legion + realm rapidcheckFull doctest ]) diff --git a/index.dox b/index.dox new file mode 100644 index 0000000000..cae3197cdf --- /dev/null +++ b/index.dox @@ -0,0 +1,42 @@ +namespace FlexFlow { +/** + +\mainpage FlexFlow Train + +\brief FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. + +\section root-layout Project Layout + +The bulk of the FlexFlow source code is stored in the following folders: + +- \subpage lib "": The C++ code that makes up FlexFlow's core, split up into a number of libraries. +- \subpage bin "": Command-line interfaces for FlexFlow and associated tools (all in C++). Generally, these are just thin wrappers that parse command-line arguments and then call out to functions defined in \ref lib for the actual processing/logic. You can find a description of each binary \ref bin "here". +- `bindings`: Python (or any additional languages added in the future) bindings for FlexFlow Train. Still mostly unimplemented. +- `docs`: Config files for documentation generators and code for generating diagrams. The actual documentation itself is included in the source directories/files in Doxygen syntax either in standalone `.dox` files or inline in header files. +- `cmake`: CMake configuration for building FlexFlow Train. Note that unless you're modifying the build configuration (i.e., adding a library, additional dependencies, etc.), you generally should use \ref contributing-proj "proj" instead of interacting with CMake directly. + +\section root-contributing Contributing + +Please let us know if you encounter any bugs or have any suggestions by submitting an issue. 
+ +For instructions on how to contribute code to FlexFlow Train, see \subpage contributing. + +We welcome all contributions to FlexFlow Train from bug fixes to new features and extensions. + +\section root-citations Citations + +- Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022. + +- Zhihao Jia, Matei Zaharia, and Alex Aiken. [Beyond Data and Model Parallelism for Deep Neural Networks](https://cs.stanford.edu/~zhihao/papers/sysml19a.pdf). In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), April 2019. + +- Zhihao Jia, Sina Lin, Charles R. Qi, and Alex Aiken. [Exploring Hidden Dimensions in Parallelizing Convolutional Neural Networks](http://proceedings.mlr.press/v80/jia18a/jia18a.pdf). In Proceedings of the International Conference on Machine Learning (ICML), July 2018. + +\section root-team The Team + +FlexFlow Train is developed and maintained by teams at CMU, Facebook, Los Alamos National Lab, MIT, Stanford, and UCSD (alphabetically). + +\section root-license License +FlexFlow Train uses Apache License 2.0. 
+ +*/ +} diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 2e71e577c0..cb3bd6d6ae 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(op-attrs) add_subdirectory(kernels) add_subdirectory(local-execution) add_subdirectory(local-pcg-execution) +add_subdirectory(realm-execution) add_subdirectory(task-spec) add_subdirectory(utils) add_subdirectory(ffi) diff --git a/lib/README.md b/lib/README.md deleted file mode 100644 index 5600c8e6aa..0000000000 --- a/lib/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# C++ Library Code - -This directory contains the core C++ code that underlies FlexFlow, organized into the following libraries: - -- `compiler`: Contains -- `kernels`: -- `op-attrs`: -- `pcg`: Contains the definitions of computation graphs and parallel computation graphs, - as well as code for serializing and deserializing both graphs -- `runtime`: -- `substitutions`: Contains the definitions of pcg substitutions, as well as the code for serializing them -- `utils`: diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml index 6a3d4987ac..42435312c3 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.dtg.toml @@ -7,6 +7,12 @@ features = [ "fmt", "hash", ] +docstring = """ +@brief The minimum amount of information needed to compute the cost of an +operator (runtime and memory). 
+ +For the runtime-only analogue, see RuntimeOnlyOpCostEstimateKey +""" includes = [ "op-attrs/pcg_operator_attrs.dtg.h", diff --git a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h index b08ca57851..aebca09ab8 100644 --- a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h @@ -24,7 +24,7 @@ namespace FlexFlow { */ SearchResult apply_substitution_and_update_machine_mapping( SearchResult const &mapped_pcg, - Substitution const &sub, + Substitution const &substitution, PCGPatternMatch const &match); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/index.dox b/lib/compiler/include/compiler/machine_mapping/index.dox new file mode 100644 index 0000000000..67452f2cb7 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/index.dox @@ -0,0 +1,14 @@ +namespace FlexFlow { +/** + +@page machine-mapping Machine Mapping + +@brief Contains the representations and logic for mappings of operators to machines/devices/GPUs. + +Core functionality includes: +- \ref MachineView "": the compiler-side representation of a mapping. + For the runtime-side representation, see \ref MappedOperatorTaskGroup. +- \ref allowed_machine_views.h + +*/ +} diff --git a/lib/compiler/include/compiler/task_graph_simulator/index.dox b/lib/compiler/include/compiler/task_graph_simulator/index.dox new file mode 100644 index 0000000000..c0a481b3a1 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page task-graph-simulator compiler/task_graph_simulator + +\todo + \@lockshaw Add docs and example(s). 
+ +*/ +} diff --git a/lib/compiler/include/compiler/unity_algorithm/index.dox b/lib/compiler/include/compiler/unity_algorithm/index.dox new file mode 100644 index 0000000000..97d0865502 --- /dev/null +++ b/lib/compiler/include/compiler/unity_algorithm/index.dox @@ -0,0 +1,8 @@ +/** + +@page unity-dp-algorithm Unity DP Algorithm + +\see + https://www.lockshaw.net/static/unity.pdf + +*/ diff --git a/lib/compiler/index.dox b/lib/compiler/index.dox new file mode 100644 index 0000000000..236b42a76c --- /dev/null +++ b/lib/compiler/index.dox @@ -0,0 +1,13 @@ +/** + +\page compiler compiler + +\brief Implements the core logic of the compiler. + +This includes: + +- \subpage unity-dp-algorithm "Unity DP Algorithm" +- \subpage task-graph-simulator "Task Graph Simulator" +- \subpage machine-mapping "Machine Mapping" + +*/ diff --git a/lib/index.dox b/lib/index.dox new file mode 100644 index 0000000000..69c52ae378 --- /dev/null +++ b/lib/index.dox @@ -0,0 +1,73 @@ +namespace FlexFlow { +/** + +\page lib lib + +This directory contains the core C++ code that underlies %FlexFlow, organized into the following libraries: + +- \subpage compiler "": \copybrief compiler +- \subpage kernels "": \copybrief kernels +- \subpage op-attrs "": \copybrief op-attrs +- \subpage pcg "": \copybrief pcg +- \subpage substitutions "": \copybrief substitutions +- \subpage utils "": \copybrief utils +- \subpage models "": \copybrief models +- \subpage task-spec "": \copybrief task-spec +- \subpage local-execution "": \copybrief local-execution +- \subpage realm-execution "": \copybrief realm-execution + +\section runtime-vs-compiler Runtime vs. Compiler + +Logically, the functionality in \c lib/ can be split into two conceptual categories: the \e compiler and the \e runtime: + +- The compiler takes in a \ref ComputationGraph provided by the end user and transforms it into an optimized \ref MappedParallelComputationGraph. 
+- The runtime takes in a \ref MappedParallelComputationGraph, which can be from the compiler or from any other tool that can generate files in FlexFlow's format (see \ref file-format for more details) and executes it, i.e., performs training iterations using the (usually distributed) execution strategy specified by the \ref MappedParallelComputationGraph. + +The distinction between the two in terms of libraries is a bit less clear. +A few libraries are used exclusively in the compiler (i.e., \ref compiler) or in the runtime (\ref realm-execution), but most are used to some degree in both: for example, while \ref op-attrs is specified to contain the compiler-side representations of the operators, these representations are included in the \ref MappedParallelComputationGraph that is executed by the runtime, and so the library is actually used by both. +In practice we use the shorthands "runtime-side" and "compiler-side" to refer to the following division of libraries: +- compiler-side: \ref utils, \ref op-attrs, \ref pcg, \ref substitutions, and \ref compiler +- runtime-side: \ref task-spec, \ref kernels, \ref local-execution, and \ref realm-execution. +- neither: \ref models. 
+ +The full (transitively-reduced) dependency graph of the libraries is as follows: + +\dot +digraph example { + utils [label="utils", URL="\ref utils", color="forestgreen", fontcolor="forestgreen"]; + opattrs [label="op-attrs", URL="\ref op-attrs", color="forestgreen", fontcolor="forestgreen"]; + pcg [label="pcg-attrs", URL="\ref pcg", color="forestgreen", fontcolor="forestgreen"]; + substitutions [label="substitutions", URL="\ref substitutions", color="forestgreen", fontcolor="forestgreen"]; + models [label="models", URL="\ref models"]; + compiler [label="compiler", URL="\ref compiler", color="forestgreen", fontcolor="forestgreen"]; + kernels [label="kernels", URL="\ref kernels", color="red", fontcolor="red"]; + taskspec [label="task-spec", URL="\ref task-spec", color="red", fontcolor="red"]; + localexecution [label="local-execution", URL="\ref local-execution", color="red", fontcolor="red"]; + realmexecution [label="realm-execution", URL="\ref realm-execution", color="red", fontcolor="red"]; + realm [label="Realm", color="red", fontcolor="red", style="dashed"]; + + utils -> opattrs; + opattrs -> pcg; + pcg -> substitutions; + substitutions -> compiler; + pcg -> models; + pcg -> kernels; + localexecution -> compiler [ style="dashed" ]; + kernels -> taskspec; + taskspec -> localexecution; + localexecution -> realmexecution; + realm -> realmexecution; + realm -> kernels [ style="dashed" ]; +} +\enddot + +where solid arrows represent link-time dependencies and dashed arrows represent run-time-only dependencies. + +\section lib-deprecated-components Deprecated Components + +- \c "local-pcg-execution": +- \c "ffi": +- \c "substitution-generator": +- \c "runtime": Out-of-date code migrated from the old %FlexFlow codebase. Currently kept around for reference, but will eventually be removed. 
+*/ +} diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h index 9b7769355e..0836503717 100644 --- a/lib/kernels/include/kernels/device_handle_t.h +++ b/lib/kernels/include/kernels/device_handle_t.h @@ -9,6 +9,9 @@ namespace FlexFlow { device_handle_t device_handle_t_from_managed_handle( std::optional const &managed_handle); +device_handle_t device_handle_t_from_managed_handle_ptr( + std::optional const &managed_handle); + device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle); device_handle_t cpu_make_device_handle_t(); diff --git a/lib/kernels/index.dox b/lib/kernels/index.dox new file mode 100644 index 0000000000..085fa513e7 --- /dev/null +++ b/lib/kernels/index.dox @@ -0,0 +1,7 @@ +/** + +\page kernels kernels + +\brief %CPU and %GPU implementations of the operators, for use in the runtime and in operator profiling. + +*/ diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index bfa2169b0d..a3f8ead17f 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -299,6 +299,7 @@ namespace std { using namespace ::FlexFlow; +///\cond size_t hash::operator()( GenericTensorAccessorR const &a) const { return get_std_hash(a.tie()); @@ -308,5 +309,6 @@ size_t hash::operator()( GenericTensorAccessorW const &a) const { return get_std_hash(a.tie()); } +///\endcond } // namespace std diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc index 85f9e2a388..0225ee8e94 100644 --- a/lib/kernels/src/kernels/device_handle_t.cc +++ b/lib/kernels/src/kernels/device_handle_t.cc @@ -11,6 +11,15 @@ device_handle_t device_handle_t_from_managed_handle( } } +device_handle_t device_handle_t_from_managed_handle_ptr( + std::optional const &managed_handle) { + if (managed_handle.has_value()) { + return gpu_make_device_handle_t(managed_handle.value()->raw_handle()); + } else { + return 
cpu_make_device_handle_t(); + } +} + device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle) { return device_handle_t{ ff_handle, diff --git a/lib/local-execution/include/local-execution/README.md b/lib/local-execution/include/local-execution/README.md deleted file mode 100644 index cc68162afc..0000000000 --- a/lib/local-execution/include/local-execution/README.md +++ /dev/null @@ -1,13 +0,0 @@ -The primary external-facing interface of local-execution. - -Major components: - -* `computation_graph_instance.h`: is the main external facing interface - * Takes a `ComputationGraph` as input, expands and initializes it - * Provides various methods to run all or a subset of passes -* `local_task_registry.h`: functions to retrieve task implementations - * Not a dynamic registry: tasks are all static now -* `local_task_argument_accessor.h`: local wrapper for `ITaskArgumentAccessor` - * Stores all of the necessary data required for a task to execute -* `task_execution.h`: utilities to prepare and execute tasks -* `tensor_allocation.h`: a pass for the dataflow graph that allocates all tensors diff --git a/lib/local-execution/include/local-execution/device_state_initialization.h b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h similarity index 81% rename from lib/local-execution/include/local-execution/device_state_initialization.h rename to lib/local-execution/include/local-execution/per_device_op_state_initialization.h index 6abd58a32c..abf24cdfd1 100644 --- a/lib/local-execution/include/local-execution/device_state_initialization.h +++ b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H 
+#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H #include "kernels/allocation.h" #include "kernels/device_handle_t.dtg.h" @@ -25,7 +25,7 @@ DynamicNodeInvocation /** * @brief Initialize all operators and save the per-device op state */ -DynamicOpenDataflowGraph perform_device_state_initialization( +DynamicOpenDataflowGraph perform_per_device_op_state_initialization( DynamicOpenDataflowGraph const &, Allocator &allocator, ProfilingSettings const &profiling_settings, diff --git a/lib/local-execution/index.dox b/lib/local-execution/index.dox new file mode 100644 index 0000000000..9c82327404 --- /dev/null +++ b/lib/local-execution/index.dox @@ -0,0 +1,25 @@ +namespace FlexFlow { +/** + +\page local-execution local-execution + +\brief Executes non-distributed \ref DynamicOpenDataflowGraph on local devices without using Realm. + Used for testing and inside of Realm to execute specific operators. + Future uses may also include fusing operator task launches in Realm. + +The primary external-facing interface of local-execution. 
+ +Major components: + +- \ref "computation_graph_instance.h": is the main external facing interface + - Takes a FlexFlow::ComputationGraph as input, expands and initializes it + - Provides various methods to run all or a subset of passes +- \ref "local_task_registry.h": functions to retrieve task implementations + - Not a dynamic registry: tasks are all static now +- \ref "local_task_argument_accessor.h": local wrapper for FlexFlow::ITaskArgumentAccessor + - Stores all of the necessary data required for a task to execute +- \ref "task_execution.h": utilities to prepare and execute tasks +- \ref "tensor_allocation.h": a pass for the dataflow graph that allocates all tensors + +*/ +} diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc index e251fafe5f..8c3a30a82d 100644 --- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc +++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc @@ -1,5 +1,5 @@ #include "local-execution/computation_graph_instance/computation_graph_instance.h" -#include "local-execution/device_state_initialization.h" +#include "local-execution/per_device_op_state_initialization.h" #include "local-execution/task_execution.h" #include "local-execution/tensor_allocation.h" #include "pcg/optimizer_attrs.h" @@ -81,7 +81,8 @@ ComputationGraphInstance create_computation_graph_instance( auto [loss_inserted_dg, label_v, logit_grad_v] = perform_loss_insertion( dg, assert_unwrap(loss_attrs), - dynamic_tensor_guid_t{assert_unwrap(logit_tensor)}); + dynamic_tensor_guid_t{assert_unwrap(logit_tensor)}, + std::nullopt); dg = loss_inserted_dg; logit_grad_value = logit_grad_v; inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)}); @@ -95,13 +96,13 @@ ComputationGraphInstance create_computation_graph_instance( 
return get_loss_tensor_accessor(dg, lgv); }); - dg = perform_device_state_initialization(dg, - allocator, - profiling_settings, - device_handle, - iteration_config, - optimizer_attrs, - device_idx); + dg = perform_per_device_op_state_initialization(dg, + allocator, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, + device_idx); // Compute the topological ordering of the graph auto [kwarg_graph, node_map] = @@ -133,7 +134,7 @@ static std::unordered_map> /*per_device_op_state=*/ transform(invocation.node_attrs.per_device_op_state, [&](DeviceSpecificPerDeviceOpState const &op_state) { - return get_device_state_from_device_specific( + return get_per_device_op_state_from_device_specific( op_state, device_idx); }), /*iteration_config=*/iteration_config, diff --git a/lib/local-execution/src/local-execution/device_state_initialization.cc b/lib/local-execution/src/local-execution/per_device_op_state_initialization.cc similarity index 95% rename from lib/local-execution/src/local-execution/device_state_initialization.cc rename to lib/local-execution/src/local-execution/per_device_op_state_initialization.cc index b5462b4b78..2cd53b428b 100644 --- a/lib/local-execution/src/local-execution/device_state_initialization.cc +++ b/lib/local-execution/src/local-execution/per_device_op_state_initialization.cc @@ -1,4 +1,4 @@ -#include "local-execution/device_state_initialization.h" +#include "local-execution/per_device_op_state_initialization.h" #include "local-execution/local_task_registry.h" #include "local-execution/task_execution.h" #include "op-attrs/computation_graph_op_attrs.dtg.h" @@ -57,7 +57,7 @@ DynamicNodeInvocation return result; } -DynamicOpenDataflowGraph perform_device_state_initialization( +DynamicOpenDataflowGraph perform_per_device_op_state_initialization( DynamicOpenDataflowGraph const &dg, Allocator &allocator, ProfilingSettings const &profiling_settings, diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc 
b/lib/local-execution/test/src/local-execution/test_e2e.cc index a74d165a31..615ba204cf 100644 --- a/lib/local-execution/test/src/local-execution/test_e2e.cc +++ b/lib/local-execution/test/src/local-execution/test_e2e.cc @@ -21,8 +21,8 @@ using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, - GenericTensorAccessorR const &last_epoch) { +static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); return tensor_accessor_all( diff --git a/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc b/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc index b94f7378ac..363e918190 100644 --- a/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc +++ b/lib/local-pcg-execution/src/local-pcg-execution/mapped_per_device_op_states_group.cc @@ -76,7 +76,7 @@ std::tuple< } bidict const & - MappedPerDeviceOpStatesGroup::get_shard_bindings() const { + MappedPerDeviceOpStatesGroup::get_per_device_op_states() const { return this->shard_bindings; } diff --git a/lib/models/include/models/bert/bert.h b/lib/models/include/models/bert/bert.h index 0047996b78..51c5a694c9 100644 --- a/lib/models/include/models/bert/bert.h +++ b/lib/models/include/models/bert/bert.h @@ -31,10 +31,10 @@ BertConfig get_default_bert_config(); * * @note This is a plain encoder-only model for pre-training. * - * @param BertConfig The config of BERT model. + * @param config The config of BERT model. * @return ComputationGraph The computation graph of a BERT model. 
*/ -ComputationGraph get_bert_computation_graph(BertConfig const &); +ComputationGraph get_bert_computation_graph(BertConfig const &config); } // namespace FlexFlow diff --git a/lib/models/include/models/bert/index.dox b/lib/models/include/models/bert/index.dox new file mode 100644 index 0000000000..f923a93480 --- /dev/null +++ b/lib/models/include/models/bert/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page models-bert models/bert + +\todo + \@lockshaw Add docs and example(s). + +*/ +} diff --git a/lib/models/include/models/candle_uno/candle_uno.h b/lib/models/include/models/candle_uno/candle_uno.h index a2d21f2830..efc99653be 100644 --- a/lib/models/include/models/candle_uno/candle_uno.h +++ b/lib/models/include/models/candle_uno/candle_uno.h @@ -31,10 +31,11 @@ CandleUnoConfig get_default_candle_uno_config(); * map from specific data identifier in the dataset to the feature name used in * this model. * - * @param CandleUnoConfig The config of the Candle Uno model. - * @return ComputationGraph The PCG of a Transformer model. + * @param config The config of the Candle Uno model. + * @return The PCG of a Transformer model. */ -ComputationGraph get_candle_uno_computation_graph(CandleUnoConfig const &); +ComputationGraph + get_candle_uno_computation_graph(CandleUnoConfig const &config); } // namespace FlexFlow diff --git a/lib/models/include/models/candle_uno/index.dox b/lib/models/include/models/candle_uno/index.dox new file mode 100644 index 0000000000..7845dca599 --- /dev/null +++ b/lib/models/include/models/candle_uno/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page models-candle-uno models/candle_uno + +\todo + \@lockshaw Add docs and example(s). 
+ +*/ +} diff --git a/lib/models/include/models/dlrm/dlrm.h b/lib/models/include/models/dlrm/dlrm.h index c3443f3b9b..481f02957d 100644 --- a/lib/models/include/models/dlrm/dlrm.h +++ b/lib/models/include/models/dlrm/dlrm.h @@ -48,8 +48,8 @@ tensor_guid_t create_dlrm_interact_features( /** * @brief Get the DLRM computation graph. * - * @param DLRMConfig The config of DLRM model. - * @return ComputationGraph The computation graph of a DLRM model. + * @param config The config of DLRM model. + * @return The computation graph of a DLRM model. */ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config); diff --git a/lib/models/include/models/dlrm/index.dox b/lib/models/include/models/dlrm/index.dox new file mode 100644 index 0000000000..1c952bc8f5 --- /dev/null +++ b/lib/models/include/models/dlrm/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page models-dlrm models/dlrm + +\todo + \@lockshaw Add docs and example(s). + +*/ +} diff --git a/lib/models/include/models/inception_v3/index.dox b/lib/models/include/models/inception_v3/index.dox new file mode 100644 index 0000000000..006e7d6334 --- /dev/null +++ b/lib/models/include/models/inception_v3/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page models-inception models/inception_v3 + +\todo + \@lockshaw Add docs and example(s). + +*/ +} diff --git a/lib/models/include/models/split_test/index.dox b/lib/models/include/models/split_test/index.dox new file mode 100644 index 0000000000..501eb1111d --- /dev/null +++ b/lib/models/include/models/split_test/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +@page split-test models/split_test + +\todo + \@lockshaw Add docs and example(s). 
+ +*/ +} diff --git a/lib/models/include/models/transformer/index.dox b/lib/models/include/models/transformer/index.dox new file mode 100644 index 0000000000..551735929c --- /dev/null +++ b/lib/models/include/models/transformer/index.dox @@ -0,0 +1,10 @@ +namespace FlexFlow { +/** + +\page models-transformer models/transformer + +\todo + \@lockshaw Add docs and example(s). + +*/ +} diff --git a/lib/models/include/models/transformer/transformer.h b/lib/models/include/models/transformer/transformer.h index 385100a4c9..20636bc524 100644 --- a/lib/models/include/models/transformer/transformer.h +++ b/lib/models/include/models/transformer/transformer.h @@ -37,10 +37,11 @@ TransformerConfig get_default_transformer_config(); /** * @brief Get the Transformer computation graph. * - * @param TransformerConfig The config of Transformer model. - * @return ComputationGraph The PCG of a Transformer model. + * @param config The config of Transformer model. + * @return The PCG of a Transformer model. */ -ComputationGraph get_transformer_computation_graph(TransformerConfig const &); +ComputationGraph + get_transformer_computation_graph(TransformerConfig const &config); } // namespace FlexFlow diff --git a/lib/models/index.dox b/lib/models/index.dox new file mode 100644 index 0000000000..8046ef125f --- /dev/null +++ b/lib/models/index.dox @@ -0,0 +1,19 @@ +namespace FlexFlow { +/** + +\page models models + +\brief Pre-built \ref ComputationGraph ""s for various models for use in testing and evaluation. 
+ +\section real-models Real Models +- \subpage models-bert "BERT" +- \subpage models-candle-uno "Candle UNO" +- \subpage models-dlrm "DLRM" +- \subpage models-inception "Inception v3" +- \subpage models-transformer "Transformer" + +\section test-models Artificial Models for Testing +- \subpage split-test + +*/ +} diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index 67d2f74ce0..13cc42b356 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -5,6 +5,7 @@ namespace FlexFlow { ComputationGraph get_split_test_computation_graph(positive_int batch_size) { + //! [ComputationGraphBuilder example] ComputationGraphBuilder cgb; positive_int layer_dim1 = 256_p; @@ -34,6 +35,7 @@ ComputationGraph get_split_test_computation_graph(positive_int batch_size) { t = cgb.softmax(t); return cgb.computation_graph; + //! [ComputationGraphBuilder example] } } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h b/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h index 67873c32b1..674f18b919 100644 --- a/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h +++ b/lib/op-attrs/include/op-attrs/initializers/uniform_initializer_attrs.h @@ -4,6 +4,7 @@ #include "op-attrs/initializers/uniform_initializer_attrs.dtg.h" #include +///\cond namespace rc { template <> @@ -12,5 +13,6 @@ struct Arbitrary<::FlexFlow::UniformInitializerAttrs> { }; } // namespace rc +///\endcond #endif diff --git a/lib/op-attrs/include/op-attrs/ops/index.dox b/lib/op-attrs/include/op-attrs/ops/index.dox new file mode 100644 index 0000000000..6e5465ca68 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/index.dox @@ -0,0 +1,45 @@ +namespace FlexFlow { +/** + +\page op-attrs-ops op-attrs/ops + +\brief Contains the compiler-side definitions of the operators. 
+ +More specifically, this consists of the following pieces: + +- A representation of the operator attributes (e.g., @ref LinearAttrs) +- Functions for inferring weight and output shapes from the set of input shapes, e.g., + - \ref "get_projection_shape(LinearAttrs const &, TensorShape const &)", + - \ref "get_bias_shape(LinearAttrs const &, TensorShape const &)", and + - \ref "get_output_shape(LinearAttrs const &, TensorShape const &)"). + + This procedure is termed shape inference. + +- Functions for inferring the parallelized weight and output shapes from the shapes of the parallelized input tensors, e.g., + - \ref "get_projection_shape(LinearAttrs const &, ParallelTensorShape const &)", + - \ref "get_bias_shape(LinearAttrs const &, ParallelTensorShape const &)", and + - \ref "get_output_shape(LinearAttrs const &, ParallelTensorShape const &)" + + This procedure is termed parallel shape inference. + The recommended way to do this currently is to exploit the fact that a \ref ParallelTensorShape is equivalent to a pair of a \ref TensorShape and a \ref ParallelTensorDimDegrees and replace the implementations of the above parallel shape inference functions with the following: + - \ref "get_projection_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)" + - \ref "get_bias_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)" + - \ref "get_output_parallel_dim_degrees(LinearAttrs const &attrs, ParallelTensorDimDegrees const &input)" + + This allows us to implement parallel shape inference as a simple composition of functions, as in the following snippet from the %Linear operator: + \snippet lib/op-attrs/src/op-attrs/ops/linear.cc parallel shape inference composition example + +- A function for inferring the slot names for the incoming tensors ( + \ref "get_linear_incoming_tensor_roles(LinearAttrs const &)") +- Functions for computing the dependencies between shards of the parallelized input, weight, and 
output tensors, e.g., + - \ref "get_operator_to_input_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)" + - \ref "get_operator_to_projection_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)" + - \ref "get_operator_to_output_mapping(LinearAttrs const &, ParallelTensorDimDegrees const &input_degrees)" + +Note that as different operators have different numbers of inputs, etc. the number and signatures of these functions may be different for different operators. While keeping the structure of the various operators similar makes it easier to understand, it's not strictly necessary: the code that calls these functions for a generic operator allows custom behavior for each operator, which allows us to have a bit more freedom to evolve operator definitions over time: +- \ref get_operator_to_ptensor_mappings (and associated functions in \ref get_operator_space_to_parallel_tensor_space_mappings.h) +- \ref "get_incoming_tensor_roles(ComputationGraphOpAttrs const &)" (and associated functions in \ref get_incoming_tensor_roles.h) +- \ref "get_output_shapes(ComputationGraphOpAttrs const &, std::unordered_map const &input_shapes)" (and associated functions in \ref op-attrs/shape_inference.h) + +*/ +} diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml index 9c8e0587c6..d771335cf6 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml +++ b/lib/op-attrs/include/op-attrs/ops/linear_attrs.dtg.toml @@ -9,6 +9,11 @@ features = [ "rapidcheck", "fmt", ] +docstring = """ +@brief Compiler-side representation of a %Linear operator. + +For details on how operators are represented on the compiler side, see @ref op-attrs-ops. 
+""" includes = [ "op-attrs/datatype.dtg.h", diff --git a/lib/op-attrs/index.dox b/lib/op-attrs/index.dox new file mode 100644 index 0000000000..86cce3594b --- /dev/null +++ b/lib/op-attrs/index.dox @@ -0,0 +1,19 @@ +namespace FlexFlow { +/** + +\page op-attrs op-attrs + +\brief Contains the compiler-side definition of all of the operators and associated functions for reasoning about their behavior, as well as the fundamental concepts needed to represent them. + +Key pieces include: + +- Representing tensors in the compiler: + \ref TensorShape, \ref TensorDims +- Representing parallel/sharded/distributed tensors in the compiler: + \ref ParallelTensorShape, \ref ParallelTensorDimDegrees +- The actual operator definitions: \subpage op-attrs-ops "ops/" +- Computing data dependencies of operators computing over parallel tensors: + \ref get_operator_space_to_parallel_tensor_space_mappings.h + +*/ +} diff --git a/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc b/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc index 2c7065c9cc..8c87f7ce27 100644 --- a/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc +++ b/lib/op-attrs/src/op-attrs/initializers/uniform_initializer_attrs.cc @@ -1,5 +1,6 @@ #include "op-attrs/initializers/uniform_initializer_attrs.h" +///\cond namespace rc { using ::FlexFlow::UniformInitializerAttrs; @@ -19,3 +20,4 @@ Gen Arbitrary::arbitrary() { }; } // namespace rc +///\endcond diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 2518df77e4..a9f8fdf02a 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -109,6 +109,7 @@ tl::expected, std::string> return weight_shapes; } +//! 
[parallel shape inference composition example] tl::expected get_projection_shape(LinearAttrs const &attrs, ParallelTensorShape const &input) { @@ -126,6 +127,7 @@ tl::expected return lift_to_parallel_with_degrees(unpar, projection_degrees); } +//! [parallel shape inference composition example] tl::expected get_bias_shape(LinearAttrs const &attrs, ParallelTensorShape const &input) { diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 064a4dd20d..4e4cacc731 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -6,6 +6,13 @@ namespace FlexFlow { +/** + * \brief A helper interface for building ComputationGraph in a pytorch (i.e., + * weight-implicit) style. + * + * For an example of how to use it, see the following code from \ref models "": + * \snippet lib/models/src/models/split_test/split_test.cc ComputationGraphBuilder example + */ struct ComputationGraphBuilder { public: ComputationGraphBuilder(); diff --git a/lib/pcg/include/pcg/file_format/v1/index.dox b/lib/pcg/include/pcg/file_format/v1/index.dox new file mode 100644 index 0000000000..e6d0d4be4f --- /dev/null +++ b/lib/pcg/include/pcg/file_format/v1/index.dox @@ -0,0 +1,5 @@ +/** + +@page file-format pcg/file_format/v1 + +*/ diff --git a/lib/pcg/include/pcg/layer_guid_t.dtg.toml b/lib/pcg/include/pcg/layer_guid_t.dtg.toml index d73cf547da..2f2f7694a0 100644 --- a/lib/pcg/include/pcg/layer_guid_t.dtg.toml +++ b/lib/pcg/include/pcg/layer_guid_t.dtg.toml @@ -6,6 +6,7 @@ features = [ "ord", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h index 5b1cad5e99..ebfdefa478 100644 --- a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h +++ 
b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h @@ -5,6 +5,7 @@ #include "pcg/machine_space_coordinate.dtg.h" #include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" #include "utils/bidict/bidict.h" +#include namespace FlexFlow { @@ -45,4 +46,15 @@ struct hash<::FlexFlow::MappedOperatorTaskGroup> { }; } // namespace std + +namespace nlohmann { + +template <> +struct adl_serializer<::FlexFlow::MappedOperatorTaskGroup> { + static ::FlexFlow::MappedOperatorTaskGroup from_json(json const &j); + static void to_json(json &j, ::FlexFlow::MappedOperatorTaskGroup const &t); +}; + +} // namespace nlohmann + #endif diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h index 25dc0721cd..21f33f6d3d 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h @@ -32,6 +32,10 @@ ParallelLayerAddedResult add_parallel_layer( ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg, TensorShape const &tensor_shape); +ParallelLayerAddedResult + pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg, + TensorShape const &tensor_shape); + OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); @@ -54,6 +58,9 @@ std::unordered_map std::unordered_set get_initial_layers(ParallelComputationGraph const &); +std::unordered_map + get_outgoing_tensors(ParallelComputationGraph const &, + parallel_layer_guid_t const &); std::unordered_map get_incoming_tensors(ParallelComputationGraph const &, parallel_layer_guid_t const &); @@ -107,6 +114,9 @@ ParallelTensorShape get_parallel_tensor_shape(ParallelComputationGraph const &, std::vector topological_ordering(ParallelComputationGraph const &); +std::unordered_map + 
get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg); + parallel_layer_guid_t get_parallel_layer_by_name(ParallelComputationGraph const &pcg, std::string const &name); diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml index 618bcb0dc4..292b361fc8 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml @@ -6,6 +6,7 @@ features = [ "ord", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml index 4494a31ac2..2710a15664 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml @@ -6,6 +6,7 @@ features = [ "ord", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml index 151f7b1f0f..e8caf0021f 100644 --- a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml +++ b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml @@ -6,6 +6,7 @@ features = [ "ord", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/pcg/index.dox b/lib/pcg/index.dox new file mode 100644 index 0000000000..22e5e23903 --- /dev/null +++ b/lib/pcg/index.dox @@ -0,0 +1,24 @@ +namespace FlexFlow{ +/** + +\page pcg pcg + +@brief Defines the top-level datastructures (ComputationGraph, ParallelComputationGraph, and MappedParallelComputationGraph) and their serialization formats, along with some helper interfaces for constructing and manipulating them. 
+ +\section pcg-datastructures Key Datastructures + +- \ref ComputationGraph "": aka CG +- \ref ParallelComputationGraph "": aka PCG +- \ref MappedParallelComputationGraph "": aka MPCG + +\section serialization-formats Serialization + +- \subpage file-format "pcg/file_format/" + +\section pcg-helpers Helper Functionality + +- \ref ComputationGraphBuilder "" +- \ref ParallelComputationGraphBuilder "" + +*/ +} diff --git a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc index d39652a7e2..8136d0e71c 100644 --- a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc +++ b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc @@ -5,6 +5,7 @@ using namespace ::FlexFlow; +///\cond namespace nlohmann { V1BinarySPDecomposition @@ -82,3 +83,4 @@ void adl_serializer::to_json( } } // namespace nlohmann +///\endcond diff --git a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc index b96a447383..4436efd727 100644 --- a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc +++ b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc @@ -90,3 +90,20 @@ size_t hash<::FlexFlow::MappedOperatorTaskGroup>::operator()( } } // namespace std + +namespace nlohmann { + +::FlexFlow::MappedOperatorTaskGroup + adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::from_json( + json const &j) { + return ::FlexFlow::MappedOperatorTaskGroup{j.template get< + ::FlexFlow::bidict<::FlexFlow::MachineSpaceCoordinate, + ::FlexFlow::OperatorAtomicTaskShardBinding>>()}; +} + +void adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::to_json( + json &j, ::FlexFlow::MappedOperatorTaskGroup const &t) { + j = t.get_shard_bindings(); +} + +} // namespace nlohmann diff --git 
a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index f83628b8e1..959747dbc7 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -142,6 +142,27 @@ ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg, }); } +ParallelLayerAddedResult + pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg, + TensorShape const &tensor_shape) { + ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{ + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, + }; + + return add_parallel_layer(/*pcg=*/pcg, + /*layer_attrs=*/layer_attrs, + /*inputs=*/{}, + /*weights=*/{}, + /*output_flags=*/ + std::unordered_map{ + { + TensorSlotName::OUTPUT, + CreateGrad::YES, + }, + }); +} + OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer) { PCGOperatorAttrs op_attrs = pcg_get_op_attrs(pcg, layer); @@ -212,6 +233,16 @@ std::unordered_set [](Node const &n) { return parallel_layer_guid_t{n}; }); } +std::unordered_map + get_outgoing_tensors(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &l) { + return map_values(get_outgoing_kwarg_dataflow_outputs_for_node( + pcg.raw_graph, l.raw_graph_node), + [](KwargDataflowOutput const &o) { + return parallel_tensor_guid_t{o}; + }); +} + std::unordered_map get_incoming_tensors(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &l) { @@ -378,6 +409,17 @@ std::vector [](Node const &n) { return parallel_layer_guid_t{n}; }); } +std::unordered_map + get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg) { + std::unordered_map + layer_attrs_mapping; + for (parallel_layer_guid_t const &layer_guid : get_parallel_layers(pcg)) { + layer_attrs_mapping.insert( + {layer_guid, 
get_parallel_layer_attrs(pcg, layer_guid)}); + } + return layer_attrs_mapping; +} + parallel_layer_guid_t get_parallel_layer_by_name(ParallelComputationGraph const &pcg, std::string const &name) { diff --git a/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc new file mode 100644 index 0000000000..1c3667afc7 --- /dev/null +++ b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc @@ -0,0 +1,42 @@ +#include "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h" +#include "op-attrs/parallel_tensor_space_coordinate.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("adl_serializer") { + bidict + shard_bindings{ + {MachineSpaceCoordinate{0_n, 0_n, DeviceType::CPU}, + OperatorAtomicTaskShardBinding{ + { + {TensorSlotName::INPUT, + ParallelTensorSpaceCoordinate{ + 0_n, 0_n, FFOrdered{1_n, 2_n, 3_n}}}, + }, + }}, + }; + MappedOperatorTaskGroup deserialized{shard_bindings}; + nlohmann::json serialized = shard_bindings; + + SUBCASE("to_json") { + nlohmann::json result = deserialized; + nlohmann::json correct = serialized; + + CHECK(result == correct); + } + + SUBCASE("from_json") { + MappedOperatorTaskGroup result = serialized; + MappedOperatorTaskGroup correct = deserialized; + + CHECK(result == correct); + } + } +} diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt new file mode 100644 index 0000000000..08676525e1 --- /dev/null +++ b/lib/realm-execution/CMakeLists.txt @@ -0,0 +1,22 @@ +ff_add_library( + NAME + realm-execution + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + 
compiler + kernels + local-execution + op-attrs + pcg + spdlog + task-spec + utils + Realm::Realm +) + +add_subdirectory(test) diff --git a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h new file mode 100644 index 0000000000..da6ba86638 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H + +#include "realm-execution/realm.h" +#include + +namespace FlexFlow { + +struct AtomicDependencySet { +public: + AtomicDependencySet() = delete; + explicit AtomicDependencySet(Realm::Event precondition); + + void add_writer(Realm::Event writer); + void add_reader(Realm::Event reader); + + Realm::Event get_dependency_for_writer() const; + Realm::Event get_dependency_for_reader() const; + +private: + Realm::Event writer; + std::vector readers; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h new file mode 100644 index 0000000000..bd6ab04cea --- /dev/null +++ b/lib/realm-execution/include/realm-execution/dependency_set.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H + +#include "realm-execution/atomic_dependency_set.h" +#include "realm-execution/realm.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include + +namespace FlexFlow { + +/** + * @brief Tracks dependencies during execution of tasks. 
+ */ +struct DependencySet { +public: + DependencySet() = delete; + explicit DependencySet(Realm::Event precondition); + + void add_writer(DynamicValueAttrs const &value, Realm::Event writer); + void add_reader(DynamicValueAttrs const &value, Realm::Event reader); + + Realm::Event get_dependency_for_writer(DynamicValueAttrs const &value) const; + Realm::Event get_dependency_for_reader(DynamicValueAttrs const &value) const; + +private: + AtomicDependencySet & + get_atomic_dependency_set(DynamicValueAttrs const &value); + +private: + Realm::Event precondition; + std::unordered_map + atomic_dependencies; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h new file mode 100644 index 0000000000..9a42861fcd --- /dev/null +++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H + +#include "kernels/device_handle_t.dtg.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "pcg/device_id_t.dtg.h" +#include "realm-execution/device_specific_ptr.h" +#include + +namespace FlexFlow { + +using DeviceSpecificManagedPerDeviceFFHandle = + DeviceSpecificPtr; + +DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle( + device_id_t const &, std::optional const &); + +device_handle_t device_handle_t_from_device_specific_managed_handle( + DeviceSpecificManagedPerDeviceFFHandle const &, device_id_t); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h new file mode 100644 
index 0000000000..590b7dbc74 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h @@ -0,0 +1,36 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H + +#include "pcg/device_id_t.dtg.h" +#include + +namespace FlexFlow { + +template +struct DeviceSpecificPtr { +public: + DeviceSpecificPtr() = delete; + explicit DeviceSpecificPtr(device_id_t device_idx, std::optional ptr) + : device_idx(device_idx), ptr(ptr) {} + + std::optional get(device_id_t device_idx) const { + ASSERT(this->device_idx == device_idx); + return this->ptr; + } + + device_id_t get_device_idx() const { + return this->device_idx; + } + + std::optional get_unsafe_raw_ptr() const { + return this->ptr; + } + +private: + device_id_t device_idx; + std::optional ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/distributed_ff_handle.h b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h new file mode 100644 index 0000000000..2a500ff150 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/distributed_ff_handle.h @@ -0,0 +1,46 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H + +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" +#include + +namespace FlexFlow { + +/** + * \brief Tracks the \ref device_handle_t (i.e., FFHandle) for each %GPU, both + * local and remote. A GPU here is represented by a Realm::Processor. 
+ */ +struct DistributedFfHandle { +public: + DistributedFfHandle() = delete; + explicit DistributedFfHandle( + std::unordered_map const + &handles); + + DeviceSpecificManagedPerDeviceFFHandle const & + at(Realm::Processor processor) const; + +private: + std::unordered_map + handles; +}; + +/** + * \brief Launches tasks (using \ref spawn_ff_handle_init_task) to create + * the \ref device_handle_t ""s for each %GPU and packages the results into a + * DistributedFfHandle. + * + * \relates DistributedFfHandle + */ +DistributedFfHandle create_distributed_ff_handle( + RealmContext &ctx, + size_t workSpaceSize, + bool allowTensorOpMathConversion, + Realm::Event precondition = Realm::Event::NO_EVENT); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h new file mode 100644 index 0000000000..0da97089ce --- /dev/null +++ b/lib/realm-execution/include/realm-execution/distributed_per_device_op_state_initialization.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_INITIALIZATION_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_PER_DEVICE_OP_STATE_INITIALIZATION_H + +#include "kernels/profiling_settings.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/per_device_op_state_backing.dtg.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" + +namespace FlexFlow { + +/** + * @brief Launches tasks (using \ref spawn_per_device_op_state_init_task) to + * create the \ref PerDeviceOpState ""s for each %GPU and packages the results + * into a PerDeviceOpStateBacking. 
+ * + * \relates PerDeviceOpStateBacking + */ +PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization( + RealmContext &ctx, + DynamicOpenDataflowGraph const &dg, + TensorInstanceBacking const &tensor_instance_backing, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h new file mode 100644 index 0000000000..bd304c5b4e --- /dev/null +++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H + +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "realm-execution/realm.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/permissions.h" + +namespace FlexFlow { + +/** + * @brief Turn a Realm region instance into a \ref GenericTensorAccessor by + * re-wrapping the raw pointer. 
+ */ +DynamicTensorAccessor dynamic_tensor_accessor_from_instance( + Realm::RegionInstance inst, + Realm::Event ready, + ParallelTensorShape const ¶llel_tensor_shape, + Permissions const &permissions, + Realm::Processor for_processor); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_event.h b/lib/realm-execution/include/realm-execution/fmt/realm_event.h new file mode 100644 index 0000000000..a245968f39 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/fmt/realm_event.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_INSTANCE_H + +#include "realm-execution/realm.h" +#include "utils/check_fmtable.h" +#include +#include + +namespace fmt { + +template +struct formatter< + ::FlexFlow::Realm::Event, + Char, + std::enable_if_t::value>> + : formatter<::std::string> { + template + auto format(::FlexFlow::Realm::Event const &m, FormatContext &ctx) + -> decltype(ctx.out()) { + std::string result = fmt::format("", m.id); + + return formatter::format(result, ctx); + } +}; + +} // namespace fmt + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_instance.h b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h new file mode 100644 index 0000000000..e6d2846c1f --- /dev/null +++ b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h @@ -0,0 +1,35 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H + +#include "realm-execution/realm.h" +#include "utils/check_fmtable.h" +#include +#include + +namespace fmt { + +template +struct formatter<::FlexFlow::Realm::RegionInstance, + Char, + std::enable_if_t::value>> 
+ : formatter<::std::string> { + template + auto format(::FlexFlow::Realm::RegionInstance const &m, FormatContext &ctx) + -> decltype(ctx.out()) { + std::string result = fmt::format("", m.id); + + return formatter::format(result, ctx); + } +}; + +} // namespace fmt + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &s, + ::FlexFlow::Realm::RegionInstance const &m); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h new file mode 100644 index 0000000000..66cc07af75 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/instance_allocation.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H + +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" + +namespace FlexFlow { + +/** + * @brief Allocates a (potentially remote) Realm instance for \p value + * on the device represented by \p device_coord. + */ +std::pair + perform_instance_allocation_for_value( + MachineSpaceCoordinate const &device_coord, + DynamicValueAttrs const &value, + RealmContext &ctx); + +/** + * @brief Allocates the (potentially remote) Realm instances for all of the + * values in \p g, excluding the preallocated values in \p preallocated, + * using \ref perform_instance_allocation_for_value. + * + * \relates TensorInstanceBacking + */ +TensorInstanceBacking perform_instance_allocation( + DynamicOpenDataflowGraph const &g, + std::unordered_map const + &preallocated, + RealmContext &ctx); + +/** + * @brief Destroys all of the instances held in \p instances. 
+ */ +void destroy_instances(TensorInstanceBacking const &instances, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance.h new file mode 100644 index 0000000000..c615244722 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/pcg_instance.h @@ -0,0 +1,139 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/per_device_op_state_backing.dtg.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" +#include "utils/units/milliseconds_t.h" +#include + +namespace FlexFlow { + +/** + * \brief The main public interface for the Realm backend. + * Takes a \ref MappedParallelComputationGraph and lowers it through + * \ref DynamicOpenDataflowGraph to get the fully-specified execution order of + * tasks to be issued. (Note: this is a parallel execution so execution order + * may not match the order in which operations are issued.) 
Also tracks the + * allocation of realm instances for tensors through its \ref + * TensorInstanceBacking. + * + * \note \ref PCGInstance is primarily just a container for the various structs + * held inside it. The actual initialization and training iteration + * functionality is held in \ref create_pcg_instance and \ref + * perform_update_pass_for_pcg_instance, respectively. + * + */ +struct PCGInstance { +public: + PCGInstance() = delete; + PCGInstance(PCGInstance const &) = delete; + PCGInstance(PCGInstance &&) = delete; + + explicit PCGInstance( + RealmContext &ctx, + std::vector const &execution_order, + TensorInstanceBacking const &tensor_instance_backing, + PerDeviceOpStateBacking const &device_state_backing, + OptimizerAttrs const &optimizer_attrs, + std::optional logit_grad_tensor); + + ~PCGInstance(); + + void update_optimizer_attrs_for_next_iter(); + + /** \name Getters **/ + ///\{ + RealmContext &get_realm_context(); + std::vector const &get_execution_order() const; + TensorInstanceBacking const &get_tensor_instance_backing() const; + PerDeviceOpStateBacking const &get_device_state_backing() const; + OptimizerAttrs const &get_optimizer_attrs() const; + std::optional get_loss_tensor_instance() const; + ///\} + +private: + RealmContext &ctx; + std::vector execution_order; + TensorInstanceBacking tensor_instance_backing; + PerDeviceOpStateBacking device_state_backing; + OptimizerAttrs optimizer_attrs; + std::optional logit_grad_tensor; +}; + +/** + * \brief Creates a \ref PCGInstance. Should generally be used instead of \ref + * PCGInstance::PCGInstance. 
+ * + * \relates PCGInstance + */ +PCGInstance create_pcg_instance( + RealmContext &ctx, + MappedParallelComputationGraph const &mpcg, + OptimizerAttrs const &optimizer_attrs, + std::optional const &loss_attrs, + std::optional label_tensor, + std::optional logit_tensor, + std::optional const &loss_mapping, + std::unordered_map const + &input_tensors, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &ff_handle, + FFIterationConfig const &iteration_config); + +/** + * \brief Dispatch a training iteration for a \ref PCGInstance. + * + * To dispatch just a piece of a training iteration, see the following + * functions: + * - \ref perform_forward_pass_for_pcg_instance + * - \ref perform_backward_pass_for_pcg_instance + * - \ref perform_update_pass_for_pcg_instance + * + * \relates PCGInstance + */ +std::unordered_map + perform_all_passes_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &ff_handle, + FFIterationConfig iteration_config); + +std::unordered_map + perform_forward_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &ff_handle, + FFIterationConfig iteration_config); + +std::unordered_map + perform_backward_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &ff_handle, + FFIterationConfig iteration_config); + +std::unordered_map + perform_update_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &ff_handle, + FFIterationConfig iteration_config); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml new file mode 100644 index 0000000000..89feb11905 --- /dev/null +++ 
b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "PerDeviceOpStateBacking" +type = "struct" +features = [] +docstring = ''' +\brief Maps each shard-expanded DynamicNodeInvocation to its corresponding PerDeviceOpState. + +PerDeviceOpStateBacking is to PerDeviceOpState as DistributedDeviceHandle is to \ref device_handle_t (i.e., FFHandle). +''' + + +includes = [ + "", + "realm-execution/device_specific_ptr.h", + "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h", + "task-spec/per_device_op_state.dtg.h", +] + +[[fields]] +name = "backing" +type = "std::unordered_map<::FlexFlow::DynamicNodeInvocation, ::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>" diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h new file mode 100644 index 0000000000..814132d355 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/realm.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_REALM_H + +#ifdef FF_USE_PREALM +#include +#else +#include +#endif + +namespace FlexFlow { + +#ifdef FF_USE_PREALM +namespace Realm = ::PRealm; +#else +namespace Realm = ::Realm; +#endif + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h new file mode 100644 index 0000000000..77af4a742c --- /dev/null +++ b/lib/realm-execution/include/realm-execution/realm_allocator.h @@ -0,0 +1,46 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H + +#include "kernels/allocation.h" +#include "realm-execution/realm.h" + +namespace FlexFlow { + +/** + * \brief An IAllocator instance that performs/manages each allocation as a + * \ref realm-instance 
"Realm Instance". + * + * \note As with the other instances of IAllocator, you generally want to use + * \ref get_realm_allocator rather than explicitly calling the constructor of + * RealmAllocator. + */ +struct RealmAllocator : public IAllocator { + explicit RealmAllocator(Realm::Processor processor, Realm::Memory memory); + + RealmAllocator() = delete; + RealmAllocator(RealmAllocator const &) = delete; + RealmAllocator(RealmAllocator &&) = delete; + ~RealmAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + +private: + Realm::Processor processor; + Realm::Memory memory; + std::unordered_map ptr_instances; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmAllocator); + +/** + * \brief Creates a RealmAllocator instance as an Allocator. + * + * \relates RealmAllocator + */ +Allocator get_realm_allocator(Realm::Processor processor, Realm::Memory memory); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h new file mode 100644 index 0000000000..0d0b412130 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/realm_context.h @@ -0,0 +1,105 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H + +#include "kernels/allocation.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "realm-execution/realm.h" +#include "realm-execution/tasks/task_id_t.dtg.h" +#include +#include + +namespace FlexFlow { + +/** + * @brief An interface that wraps the rest of Realm and protects against certain + * classes of bugs, such as shutdown bugs. 
+ * + * @warning Do NOT call Realm directly unless you know what you are doing. + */ +struct RealmContext { +public: + RealmContext(Realm::Processor processor); + virtual ~RealmContext(); + + RealmContext() = delete; + RealmContext(RealmContext const &) = delete; + RealmContext(RealmContext &&) = delete; + + /** \name Device mapping */ + ///\{ + Realm::Processor + map_device_coord_to_processor(MachineSpaceCoordinate const &); + static Realm::Memory get_nearest_memory(Realm::Processor); + ///\} + + /** \name Current device context */ + ///\{ + Realm::Processor get_current_processor() const; + Allocator &get_current_device_allocator(); + device_id_t get_current_device_idx() const; + ///\} + + /** \name Task creation */ + ///\{ + Realm::Event spawn_task(Realm::Processor proc, + task_id_t task_id, + void const *args, + size_t arglen, + Realm::ProfilingRequestSet const &requests, + Realm::Event wait_on = Realm::Event::NO_EVENT, + int priority = 0); + + Realm::Event + collective_spawn_task(Realm::Processor target_proc, + task_id_t task_id, + void const *args, + size_t arglen, + Realm::Event wait_on = Realm::Event::NO_EVENT, + int priority = 0); + ///\} + + /** \name Instance management */ + ///\{ + std::pair + create_instance(Realm::Memory memory, + TensorShape const &shape, + Realm::ProfilingRequestSet const &prs, + Realm::Event wait_on = Realm::Event::NO_EVENT); + ///\} + + /** + * \brief Get the current set of outstanding events + */ + Realm::Event get_outstanding_events(); + +protected: + /** + * \brief Compact **and clear** the outstanding event queue + * + * \warning **User must block** on event or else use it, or it **will be + * lost** (potentially resulting in a shutdown hang). 
+ */ + [[nodiscard]] Realm::Event merge_outstanding_events(); + + void discover_machine_topology(); + + static std::optional + make_device_handle_for_processor(Realm::Processor processor); + +protected: + Realm::Runtime runtime; + Realm::Processor processor; + Allocator allocator; + std::vector outstanding_events; + std::unordered_map, + std::vector> + processors; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h new file mode 100644 index 0000000000..287218749e --- /dev/null +++ b/lib/realm-execution/include/realm-execution/realm_manager.h @@ -0,0 +1,42 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H + +#include "kernels/allocation.h" +#include "kernels/device_handle_t.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" + +namespace FlexFlow { + +/** + * @brief Manages the initialization and shutdown of the Realm runtime. + * Provides the interface to launch the \ref term-controller that runs the rest + * of the computation (i.e., \ref RealmManager::start_controller). + */ +struct RealmManager : private RealmContext { +public: + RealmManager(int *argc, char ***argv); + virtual ~RealmManager(); + + RealmManager() = delete; + RealmManager(RealmManager const &) = delete; + RealmManager(RealmManager &&) = delete; + + /** + * @brief Launches the \ref term-controller. Currently there is exactly + * one controller for the entire machine. The controller may be a function + * that closes over data (i.e., a lambda). + * + * @warning If the provided function closes over data, **the user must block + * on the resulting event** to ensure it remains in scope until the controller + * completes. 
+ */ + [[nodiscard]] Realm::Event + start_controller(std::function, + Realm::Event wait_on = Realm::Event::NO_EVENT); +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h new file mode 100644 index 0000000000..7219c5c07f --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H + +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" + +namespace FlexFlow { + +/** + * \brief A stub function to work around Realm not allowing lambdas to be + * registered as Realm tasks. Takes the desired lambda to run as the \ref + * term-controller as an argument and immediately calls it. + */ +void controller_task_body( + void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Dispatches the \ref term-controller task. Packages up the provided \c + * std::function and passes it along to \ref controller_task_body. 
+ */ +Realm::Event + collective_spawn_controller_task(RealmContext &ctx, + Realm::Processor &target_proc, + std::function thunk, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h new file mode 100644 index 0000000000..f6a07e97d4 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_return_task.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H + +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" + +namespace FlexFlow { + +/** + * \brief The function registered as a Realm task for returning the + * asynchronously-initialized \ref PerDeviceFFHandle. Dispatched by \ref + * spawn_ff_handle_init_return_task. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. + */ +void ff_handle_init_return_task_body( + void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Launches the task (\ref ff_handle_init_return_task_body) for returning + * the asynchronously-initialized \ref PerDeviceFFHandle. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. 
+ */ +Realm::Event spawn_ff_handle_init_return_task( + RealmContext &ctx, + Realm::Processor origin_proc, + DeviceSpecificManagedPerDeviceFFHandle const &result, + DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h new file mode 100644 index 0000000000..64384b6ae6 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task.h @@ -0,0 +1,38 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_FF_HANDLE_INIT_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_FF_HANDLE_INIT_TASK_H + +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" + +namespace FlexFlow { + +/** + * \brief The function registered as a Realm task for starting the asynchronous + * initialization of the \ref PerDeviceFFHandle. Dispatched by \ref + * spawn_ff_handle_init_task. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. + */ +void ff_handle_init_task_body( + void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Launches the task (\ref ff_handle_init_task_body) for starting + * the asynchronous initialization of the \ref PerDeviceFFHandle. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. 
+ */ +Realm::Event spawn_ff_handle_init_task( + RealmContext &ctx, + Realm::Processor target_proc, + size_t workSpaceSize, + bool allowTensorOpMathConversion, + DeviceSpecificManagedPerDeviceFFHandle *result_ptr, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml new file mode 100644 index 0000000000..808a350091 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/ff_handle_init_task_args.dtg.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "FfHandleInitTaskArgs" +type = "struct" +features = [] + +includes = [ + "realm-execution/device_specific_managed_per_device_ff_handle.h", + "realm-execution/realm.h", + "realm-execution/tasks/serializer/serializable_realm_processor.h", +] + +[[fields]] +name = "workSpaceSize" +type = "size_t" + +[[fields]] +name = "allowTensorOpMathConversion" +type = "bool" + +[[fields]] +name = "origin_proc" +type = "::FlexFlow::Realm::Processor" + +[[fields]] +name = "origin_result_ptr" +type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle *" diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/index.dox b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox new file mode 100644 index 0000000000..910488a863 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/index.dox @@ -0,0 +1,36 @@ +namespace FlexFlow { +/** + +\page realm-execution-tasks tasks/ + +\c realm-execution groups tasks into four kinds (\ref tasks-controller-tasks, \ref tasks-op-task, \ref tasks-ffhandle-init, and \ref tasks-op-state-init), each which is implemented using one of two patterns (\ref tasks-one-part or \ref tasks-two-part). 
+ +\section tasks-one-part Individual Tasks + +Individual tasks are just normal Realm tasks, which are implemented in \ref realm-execution as a +wrapper function for spawning a task (e.g., \ref collective_spawn_controller_task) and a task body which is the actual Realm task implementation (e.g., \ref controller_task_body). Each also has an optional corresponding TaskArgument (e.g., \ref OpTaskArgs) object to provide a structure to the arguments passed from the wrapper to the task body. In cases where the %TaskArgument object is not trivially JSON-serializable, a corresponding JSON-serializable task argument type is provided (e.g., \ref SerializableOpTaskArgs). + +\subsection tasks-controller-tasks Controller Tasks + +Runs the \ref term-controller. The current implementation uses exactly one controller (i.e., centralized control), but the intention is to expand this in the future to distributed control (one controller per node, or one per device). Implemented in \ref controller_task.h. + +\subsection tasks-op-task Operator Tasks + +Implements all of the operator tasks, i.e., the tasks that are executed during training (i.e., forward, backward, update/optimizer, and loss tasks). Implemented in \ref op_task.h. + +\section tasks-two-part Paired Tasks + +The other two types of tasks are implemented as pairs of tasks: one to begin initializing a value (e.g., \ref spawn_ff_handle_init_task), and another to return the initialized value when it's ready (e.g., \ref spawn_ff_handle_init_return_task). As with \ref tasks-one-part, they have an optional corresponding task argument type and a potential serializable task argument type. + +The paired task structure is required because Realm tasks do not return. Spawning a Realm task returns a completion event, but the event does not encode any information (other than that the task is finished). 
Thus, to return a value to the caller, a second task is required to send the value back, and the caller must block for this task to complete to ensure that the data is available before proceeding. + +\subsection tasks-ffhandle-init FFHandle Initialization Tasks + +For initializing the \ref PerDeviceFFHandle for each GPU. Implemented in \ref ff_handle_init_task.h and \ref ff_handle_init_return_task.h. + +\subsection tasks-op-state-init PerDeviceOpState Initialization Tasks + +For initializing the \ref PerDeviceOpState for each shard of an operator task. Implemented in \ref per_device_op_state_init_task.h and \ref per_device_op_state_init_return_task.h. + +*/ +} diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h new file mode 100644 index 0000000000..4aa0329a96 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h @@ -0,0 +1,69 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H + +#include "kernels/profiling_settings.dtg.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/device_specific_ptr.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" +#include "task-spec/per_device_op_state.dtg.h" +#include + +namespace FlexFlow { + +/** + * \brief The function registered as a Realm task for operator-related tasks. + * Dispatched by \ref spawn_op_task. 
+ */ +void op_task_body(void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Launches the task (\ref op_task_body), for a \ref + * DynamicNodeInvocation using Realm. + * + * The task launch process functions a bit differently to that used in the + * previous FlexFlow codebase. Rather than having a function registered with + * realm/legion for every \ref task_id_t, we now have only a few functions + * registered: \ref op_task_body, \ref ff_handle_init_task_body, + * \ref per_device_op_state_init_return_task_body, and \ref controller_task_body + * (see \ref register_all_tasks for where this list comes from), and in fact + * only \ref op_task_body is launched by \ref spawn_op_task. Each of these + * registered tasks uses the serialized arguments sent to them to dispatch to the + * correct implementation in task-spec: for example, if we are trying to launch + * the task for a Conv2d operator, this function will actually dispatch a call + * to \ref op_task_body with a serialized \ref OpTaskArgs as an argument, and + * then \ref op_task_body will deserialize the argument, determine that we are + * trying to launch the forward pass of Conv2d, use \ref + * execute_dynamic_node_invocation (which then uses \ref call_fwd_task_impl) to + * actually call the function in lib/task-spec/src/task-spec/ops/impl/conv_2d.cc. + * + * The above also means that we don't have a separate + * \ref ITaskArgumentAccessor subclass for realm-execution. Instead we ship + * the information on the corresponding realm instances over to the remote node, + * grab the corresponding pointer/\ref GenericTensorAccessor, and then use + * \ref LocalTaskArgumentAccessor for the actual argument access as, by this + * point, everything is local. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. 
+ */ +Realm::Event spawn_op_task( + RealmContext &ctx, + Realm::Processor target_proc, + DynamicNodeInvocation const &invocation, + TensorInstanceBacking const &tensor_backing, + std::optional> const &device_state, + ProfilingSettings const &profiling_settings, + DeviceSpecificManagedPerDeviceFFHandle const &device_handle, + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml new file mode 100644 index 0000000000..f6bb83fbca --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml @@ -0,0 +1,43 @@ +namespace = "FlexFlow" +name = "OpTaskArgs" +type = "struct" +features = [] + +includes = [ + "kernels/profiling_settings.dtg.h", + "pcg/optimizer_attrs.dtg.h", + "realm-execution/device_specific_managed_per_device_ff_handle.h", + "realm-execution/device_specific_ptr.h", + "realm-execution/tensor_instance_backing.dtg.h", + "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h", + "task-spec/ff_iteration_config.dtg.h", + "task-spec/per_device_op_state.dtg.h", +] + +[[fields]] +name = "invocation" +type = "::FlexFlow::DynamicNodeInvocation" + +[[fields]] +name = "tensor_backing" +type = "::FlexFlow::TensorInstanceBacking" + +[[fields]] +name = "device_state" +type = "std::optional<::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>" + +[[fields]] +name = "profiling_settings" +type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "device_handle" +type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle" + +[[fields]] +name = "iteration_config" +type = "::FlexFlow::FFIterationConfig" + +[[fields]] +name = "optimizer_attrs" +type = "std::optional<::FlexFlow::OptimizerAttrs>" diff --git 
a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h new file mode 100644 index 0000000000..46a4bab727 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_return_task.h @@ -0,0 +1,38 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_RETURN_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_RETURN_TASK_H + +#include "realm-execution/device_specific_ptr.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" +#include "task-spec/per_device_op_state.dtg.h" + +namespace FlexFlow { + +/** + * \brief The function registered as a Realm task for returning the + * asynchronously-initialized \ref PerDeviceOpState. Dispatched by \ref + * spawn_per_device_op_state_init_return_task. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. + */ +void per_device_op_state_init_return_task_body( + void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Launches the task (\ref per_device_op_state_init_return_task_body) for + * returning the asynchronously-initialized \ref PerDeviceOpState. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. 
+ */ +Realm::Event spawn_per_device_op_state_init_return_task( + RealmContext &ctx, + Realm::Processor origin_proc, + DeviceSpecificPtr const &result, + DeviceSpecificPtr *origin_result_ptr, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h new file mode 100644 index 0000000000..95b768a245 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_TASK_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_PER_DEVICE_OP_STATE_INIT_TASK_H + +#include "kernels/profiling_settings.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/device_specific_ptr.h" +#include "realm-execution/realm.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" +#include "task-spec/per_device_op_state.dtg.h" + +namespace FlexFlow { + +/** + * \brief The function registered as a Realm task for starting the asynchronous + * initialization of the \ref PerDeviceOpState. Dispatched by \ref + * spawn_per_device_op_state_init_task. + * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. + */ +void per_device_op_state_init_task_body( + void const *, size_t, void const *, size_t, Realm::Processor); + +/** + * \brief Launches the task (\ref per_device_op_state_init_task_body) for + * starting the asynchronous initialization of the \ref PerDeviceOpState. 
+ * + * To understand how this fits into the broader structure of \ref + * realm-execution, see \ref realm-execution-tasks. + */ +std::optional spawn_per_device_op_state_init_task( + RealmContext &ctx, + Realm::Processor target_proc, + DynamicNodeInvocation const &invocation, + TensorInstanceBacking const &tensor_backing, + ProfilingSettings const &profiling_settings, + DeviceSpecificManagedPerDeviceFFHandle const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + DeviceSpecificPtr *result_ptr, + Realm::Event precondition); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml new file mode 100644 index 0000000000..57012ce716 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.toml @@ -0,0 +1,48 @@ +namespace = "FlexFlow" +name = "PerDeviceOpStateInitTaskArgs" +type = "struct" +features = [] + +includes = [ + "kernels/profiling_settings.dtg.h", + "pcg/optimizer_attrs.dtg.h", + "realm-execution/device_specific_managed_per_device_ff_handle.h", + "realm-execution/tensor_instance_backing.dtg.h", + "realm-execution/realm.h", + "task-spec/device_specific_per_device_op_state.dtg.h", + "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h", + "task-spec/ff_iteration_config.dtg.h", + "task-spec/per_device_op_state.dtg.h", +] + +[[fields]] +name = "invocation" +type = "::FlexFlow::DynamicNodeInvocation" + +[[fields]] +name = "tensor_backing" +type = "TensorInstanceBacking" + +[[fields]] +name = "profiling_settings" +type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "device_handle" +type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle" + +[[fields]] +name = "iteration_config" +type = "::FlexFlow::FFIterationConfig" + +[[fields]] +name = 
"optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + +[[fields]] +name = "origin_proc" +type = "::FlexFlow::Realm::Processor" + +[[fields]] +name = "origin_result_ptr" +type = "::FlexFlow::DeviceSpecificPtr *" diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml new file mode 100644 index 0000000000..9d7414aac6 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.toml @@ -0,0 +1,29 @@ +namespace = "FlexFlow" +name = "SerializableFfHandleInitTaskArgs" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h", +] + +[[fields]] +name = "workSpaceSize" +type = "size_t" + +[[fields]] +name = "allowTensorOpMathConversion" +type = "bool" + +[[fields]] +name = "origin_proc" +type = "::FlexFlow::SerializableRealmProcessor" + +[[fields]] +name = "origin_result_ptr" +type = "uintptr_t" diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h new file mode 100644 index 0000000000..0d63d3610c --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H + +#include "realm-execution/tasks/impl/ff_handle_init_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.dtg.h" + +namespace FlexFlow { + +SerializableFfHandleInitTaskArgs + 
ff_handle_init_task_args_to_serializable(FfHandleInitTaskArgs const &); + +FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable( + SerializableFfHandleInitTaskArgs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml new file mode 100644 index 0000000000..adac6631ee --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml @@ -0,0 +1,51 @@ +namespace = "FlexFlow" +name = "SerializableOpTaskArgs" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "kernels/profiling_settings.dtg.h", + "pcg/optimizer_attrs.dtg.h", + "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h", + "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h", + "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h", + "task-spec/ff_iteration_config.dtg.h", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", +] + +[[fields]] +name = "invocation" +type = "::FlexFlow::SerializableDynamicNodeInvocation" + +[[fields]] +name = "tensor_backing" +type = "::FlexFlow::SerializableTensorInstanceBacking" + +[[fields]] +name = "device_state" +type = "std::optional<::FlexFlow::SerializableDeviceSpecificPtr>" + +[[fields]] +name = "profiling_settings" +type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "device_handle" +type = "::FlexFlow::SerializableDeviceSpecificPtr" + +[[fields]] +name = "iteration_config" +type = "::FlexFlow::FFIterationConfig" + +[[fields]] +name = "optimizer_attrs" +type = "std::optional<::FlexFlow::OptimizerAttrs>" diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h new file mode 100644 
index 0000000000..3b2d05d0b6 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H + +#include "realm-execution/tasks/impl/op_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_op_task_args.dtg.h" + +namespace FlexFlow { + +SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &); +OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml new file mode 100644 index 0000000000..0e53767862 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.toml @@ -0,0 +1,52 @@ +namespace = "FlexFlow" +name = "SerializablePerDeviceOpStateInitTaskArgs" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "kernels/profiling_settings.dtg.h", + "pcg/optimizer_attrs.dtg.h", + "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h", + "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h", + "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h", + "task-spec/device_specific_per_device_op_state.dtg.h", + "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h", + "task-spec/ff_iteration_config.dtg.h", +] + +[[fields]] +name = "invocation" +type = "::FlexFlow::SerializableDynamicNodeInvocation" + +[[fields]] +name = "tensor_backing" +type = "::FlexFlow::SerializableTensorInstanceBacking" + +[[fields]] +name = "profiling_settings" 
+type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "device_handle" +type = "::FlexFlow::SerializableDeviceSpecificPtr" + +[[fields]] +name = "iteration_config" +type = "::FlexFlow::FFIterationConfig" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + +[[fields]] +name = "origin_proc" +type = "::FlexFlow::SerializableRealmProcessor" + +[[fields]] +name = "origin_result_ptr" +type = "uintptr_t" diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h new file mode 100644 index 0000000000..62454d168f --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H + +#include "realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.dtg.h" + +namespace FlexFlow { + +SerializablePerDeviceOpStateInitTaskArgs + per_device_op_state_init_task_args_to_serializable( + PerDeviceOpStateInitTaskArgs const &); +PerDeviceOpStateInitTaskArgs + per_device_op_state_init_task_args_from_serializable( + SerializablePerDeviceOpStateInitTaskArgs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h new file mode 100644 index 0000000000..a956d53643 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h @@ -0,0 +1,33 @@ +#ifndef 
_FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H + +#include "realm-execution/realm.h" +#include "realm-execution/tasks/task_id_t.dtg.h" + +namespace FlexFlow { + +/** + * \brief Registers a function as a Realm task. + * + * \warning The event returned by this function must be consumed or + * else Realm may not shut down properly. + */ +[[nodiscard]] Realm::Event register_task(Realm::Processor::Kind target_kind, + task_id_t func_id, + void (*task_body)(void const *, + size_t, + void const *, + size_t, + Realm::Processor)); + +/** + * \brief Registers all known tasks (using \ref register_task). + * + * \warning The event returned by this function must be consumed or + * else Realm may not shut down properly. + */ +[[nodiscard]] Realm::Event register_all_tasks(); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml new file mode 100644 index 0000000000..07cf61f7e1 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "SerializableDeviceSpecificPtr" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "pcg/device_id_t.dtg.h", + "cstdint", + "optional", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", +] + +[[fields]] +name = "device_idx" +type = "::FlexFlow::device_id_t" + +[[fields]] +name = "ptr" +type = "std::optional" diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h new file mode 100644 index 
0000000000..726aef84ba --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H + +#include "realm-execution/device_specific_ptr.h" +#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h" + +namespace FlexFlow { + +template +SerializableDeviceSpecificPtr device_specific_ptr_to_serializable( + DeviceSpecificPtr const &device_specific) { + return SerializableDeviceSpecificPtr{ + /*device_idx=*/device_specific.get_device_idx(), + /*ptr=*/ + transform(device_specific.get_unsafe_raw_ptr(), + [](T *ptr) { return reinterpret_cast(ptr); }), + }; +} + +template +DeviceSpecificPtr device_specific_ptr_from_serializable( + SerializableDeviceSpecificPtr const &device_specific) { + return DeviceSpecificPtr{ + /*device_idx*/ device_specific.device_idx, + /*ptr=*/transform(device_specific.ptr, [](uintptr_t ptrval) { + return reinterpret_cast(ptrval); + })}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml new file mode 100644 index 0000000000..3217d58608 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "SerializableRealmEvent" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "realm-execution/realm.h", +] + +[[fields]] +name = "id" +type = "::FlexFlow::Realm::Event::id_t" diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h 
b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h new file mode 100644 index 0000000000..ae1f1e8265 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H + +#include "realm-execution/realm.h" +#include "realm-execution/tasks/serializer/serializable_realm_event.dtg.h" + +namespace FlexFlow { + +SerializableRealmEvent realm_event_to_serializable(Realm::Event const &); +Realm::Event realm_event_from_serializable(SerializableRealmEvent const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml new file mode 100644 index 0000000000..5b70c6888b --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "SerializableRealmInstance" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "realm-execution/realm.h", +] + +src_includes = [ + "utils/fmt/vector.h", + "utils/hash/vector.h", +] + +[[fields]] +name = "instance" +# Realm::RegionInstance has hidden fields in PRealm so we need to encode it as bytes +type = "std::vector" diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h new file mode 100644 index 0000000000..7262ec4f09 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h @@ -0,0 +1,16 @@ +#ifndef 
_FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H + +#include "realm-execution/realm.h" +#include "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h" + +namespace FlexFlow { + +SerializableRealmInstance + realm_instance_to_serializable(Realm::RegionInstance const &); +Realm::RegionInstance + realm_instance_from_serializable(SerializableRealmInstance const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml new file mode 100644 index 0000000000..3cb64d95c1 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "SerializableRealmProcessor" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "realm-execution/realm.h", +] + +[[fields]] +name = "id" +type = "::FlexFlow::Realm::Processor::id_t" diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h new file mode 100644 index 0000000000..6b29b6e223 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H + +#include "realm-execution/realm.h" +#include "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h" + +namespace FlexFlow { + +SerializableRealmProcessor 
+ realm_processor_to_serializable(Realm::Processor const &); +Realm::Processor + realm_processor_from_serializable(SerializableRealmProcessor const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml new file mode 100644 index 0000000000..75a796b2ee --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "SerializableTensorInstanceBacking" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "", + "realm-execution/tasks/serializer/serializable_realm_event.dtg.h", + "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h", + "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/pair.h", + "utils/fmt/unordered_map.h", +] + +[[fields]] +name = "backing" +type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, std::pair<::FlexFlow::SerializableRealmInstance, ::FlexFlow::SerializableRealmEvent>>" diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h new file mode 100644 index 0000000000..b536972b40 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H + +#include 
"realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h" +#include "realm-execution/tensor_instance_backing.dtg.h" + +namespace FlexFlow { + +SerializableTensorInstanceBacking + tensor_instance_backing_to_serializable(TensorInstanceBacking const &); +TensorInstanceBacking tensor_instance_backing_from_serializable( + SerializableTensorInstanceBacking const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h new file mode 100644 index 0000000000..3208368d2d --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H + +#include +#include +#include + +namespace FlexFlow { + +template +std::string serialize_task_args(T const &args) { + nlohmann::json j = args; + return j.dump(); +} + +template +T deserialize_task_args(void const *args, size_t arglen) { + nlohmann::json j = nlohmann::json::parse( + std::string_view{reinterpret_cast(args), arglen}); + return j.get(); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml similarity index 88% rename from lib/task-spec/include/task-spec/task_id_t.dtg.toml rename to lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml index ce2de52d40..b1e5e07e28 100644 --- a/lib/task-spec/include/task-spec/task_id_t.dtg.toml +++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml @@ -7,12 +7,31 @@ features = [ "rapidcheck", "json", ] +docstring = ''' +\brief An enum for identifying tasks for use in the Realm runtime. 
+ +\note Many of these are pulled over from the old FlexFlow codebase and are no +longer in use. Eventually these should be pruned down to the set of tasks we're +actually using. + +\note @ref task_id_t is used by the realm runtime (i.e., \ref realm-execution), +but not by realm directly: realm-execution uses \ref +get_realm_task_id_for_task_id to convert every \ref task_id_t into a +Realm::Processor::TaskFuncID, which is what is actually used for task launches, +etc. +''' + +[[values]] +name = "CONTROLLER_TASK_ID" + +[[values]] +name = "DEVICE_HANDLE_INIT_TASK_ID" [[values]] -name = "TOP_LEVEL_TASK_ID" +name = "DEVICE_HANDLE_INIT_RETURN_TASK_ID" [[values]] -name = "FF_INIT_TASK_ID" +name = "DEVICE_STATE_INIT_RETURN_TASK_ID" [[values]] name = "IMAGE_INIT_TASK_ID" diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h new file mode 100644 index 0000000000..299df5cc3f --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h @@ -0,0 +1,38 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H + +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-execution/realm.h" +#include "realm-execution/tasks/task_id_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h" +#include + +namespace FlexFlow { + +/** + * \brief Retrieves the \ref task_id_t for a \ref DynamicNodeAttrs, with + * a return value of \c std::nullopt to be treated as a no-op task. 
+ */ +std::optional + get_task_id_for_op(DynamicNodeAttrs const &, + std::optional const &); + +std::optional + get_init_task_id_for_op_attrs(PCGOperatorAttrs const &); + +std::optional get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &); + +std::optional get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &); + +std::optional + get_update_task_id_for_optimizer_attrs(OptimizerAttrs const &); + +/** + * \brief Convert a \ref FlexFlow::task_id_t into a Realm task ID. + */ +Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml new file mode 100644 index 0000000000..1105af4a92 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "TensorInstanceBacking" +type = "struct" +features = [ + "eq", + "fmt", + #"hash", +] +docstring = ''' +\brief A simple container for mapping between DynamicValueAttrs and the corresponding Realm instances (along with the ready event for each instance). + +\note The actual logic for doing instance allocation and destruction is in \ref perform_instance_allocation +and \ref destroy_instances, respectively.
+''' + +includes = [ + "", + "realm-execution/realm.h", + "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h", +] + +src_includes = [ + "realm-execution/fmt/realm_event.h", + "realm-execution/fmt/realm_instance.h", + "utils/fmt/unordered_map.h", + "utils/hash/unordered_map.h", +] + +[[fields]] +name = "backing" +type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, std::pair<::FlexFlow::Realm::RegionInstance, ::FlexFlow::Realm::Event>>" diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h new file mode 100644 index 0000000000..93e525a349 --- /dev/null +++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H + +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" + +namespace FlexFlow { + +/** + * \brief Make an empty TensorInstanceBacking. + * + * \relates TensorInstanceBacking + */ +TensorInstanceBacking make_empty_tensor_instance_backing(); + +/** + * \brief Get the subset of the given TensorInstanceBacking necessary to execute + * the given DynamicNodeInvocation. + */ +TensorInstanceBacking subset_tensor_instance_backing_for_invocation( + TensorInstanceBacking const &, DynamicNodeInvocation const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/index.dox b/lib/realm-execution/index.dox new file mode 100644 index 0000000000..351d6b2d21 --- /dev/null +++ b/lib/realm-execution/index.dox @@ -0,0 +1,50 @@ +namespace FlexFlow { +/** + +\page realm-execution realm-execution + +\brief Executes distributed \ref MappedParallelComputationGraph ""s using Realm, primarily by lowering them to distributed \ref DynamicOpenDataflowGraph ""s using \ref task-spec. 
+ +The Realm backend for distributed execution. + +This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution. + +\section realm-execution-usage Example Usage + +\snippet{local} lib/realm-execution/test/src/realm-execution/test_e2e.cc realm-execution example + +\section realm-execution-major-components Major Components + +- \ref PCGInstance "": \copybrief PCGInstance +- \ref RealmManager "": \copybrief RealmManager +- \ref RealmContext "": \copybrief RealmContext +- \subpage realm-execution-tasks "include/realm-execution/tasks": The Realm task implementations and their supporting infrastructure. + - \ref "lib/realm-execution/include/realm-execution/tasks/impl" "impl/": the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments. + - \ref lib/realm-execution/include/realm-execution/tasks/serializer/ "serializer/": additional support for serializing Realm data types. + - \ref "realm_task_registry.h": Manages the registration of Realm tasks. All Realm tasks go through this interface. + - \ref "task_id_t.h": Type (\ref task_id_t) to represent Realm tasks, along with an encoding to Realm's native task ID type. +- Helper components (mainly used within \ref PCGInstance) + - \ref "DistributedFfHandle": represents a distributed \ref PerDeviceFFHandle (i.e., a \ref PerDeviceFFHandle on each of the GPUs in the machine), for convenience. + - \ref DependencySet "": tracks dependencies during execution of tasks. + - \ref "distributed_per_device_op_state_initialization.h": performs distributed initialization of \ref "PerDeviceOpState"s and packages the results into a \ref PerDeviceOpStateBacking. 
+ - \ref "instance_allocation.h": allocates instances for tensors in the dynamic graph and returns the resulting \ref TensorInstanceBacking. + +\section realm-execution-todo Outstanding TODOs + +- external instances +- copies +- task fusion +- parallel operator implementation (partition, reduce, gather, etc.) +- and fused parallel operators (reduce + broadcast = allreduce) +- memory-optimizing compiler integration (tensor creation/destruction, tensor reuse) +- control replication +- Realm subgraphs + +\section terminology Terminology + +\subsection term-controller controller + +The main thread/function that, in a non-control-replicated implementation, processes the task graph and dispatches all of the tasks. In the future this will be extended to operate in a distributed fashion. + +*/ +} diff --git a/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc new file mode 100644 index 0000000000..ba4fcc5a9f --- /dev/null +++ b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc @@ -0,0 +1,27 @@ +#include "realm-execution/atomic_dependency_set.h" + +namespace FlexFlow { + +AtomicDependencySet::AtomicDependencySet(Realm::Event precondition) + : writer(precondition) {} + +void AtomicDependencySet::add_writer(Realm::Event writer) { + this->writer = + Realm::Event::merge_events(writer, this->get_dependency_for_writer()); + this->readers.clear(); +} + +void AtomicDependencySet::add_reader(Realm::Event reader) { + this->readers.push_back(reader); +} + +Realm::Event AtomicDependencySet::get_dependency_for_writer() const { + Realm::Event readers = Realm::Event::merge_events(this->readers); + return Realm::Event::merge_events(this->writer, readers); +} + +Realm::Event AtomicDependencySet::get_dependency_for_reader() const { + return this->writer; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/dependency_set.cc
b/lib/realm-execution/src/realm-execution/dependency_set.cc new file mode 100644 index 0000000000..84412a125d --- /dev/null +++ b/lib/realm-execution/src/realm-execution/dependency_set.cc @@ -0,0 +1,49 @@ +#include "realm-execution/dependency_set.h" +#include "realm-execution/atomic_dependency_set.h" +#include "utils/containers/contains_key.h" + +namespace FlexFlow { + +DependencySet::DependencySet(Realm::Event precondition) + : precondition(precondition) {} + +void DependencySet::add_writer(DynamicValueAttrs const &value, + Realm::Event writer) { + AtomicDependencySet &atomic_dependence_set = + this->get_atomic_dependency_set(value); + atomic_dependence_set.add_writer(writer); +} + +void DependencySet::add_reader(DynamicValueAttrs const &value, + Realm::Event reader) { + AtomicDependencySet &atomic_dependence_set = + this->get_atomic_dependency_set(value); + atomic_dependence_set.add_reader(reader); +} + +Realm::Event DependencySet::get_dependency_for_writer( + DynamicValueAttrs const &value) const { + if (contains_key(this->atomic_dependencies, value)) { + return this->atomic_dependencies.at(value).get_dependency_for_writer(); + } + return this->precondition; +} + +Realm::Event DependencySet::get_dependency_for_reader( + DynamicValueAttrs const &value) const { + if (contains_key(this->atomic_dependencies, value)) { + return this->atomic_dependencies.at(value).get_dependency_for_reader(); + } + return this->precondition; +} + +AtomicDependencySet & + DependencySet::get_atomic_dependency_set(DynamicValueAttrs const &value) { + if (!contains_key(this->atomic_dependencies, value)) { + this->atomic_dependencies.insert( + {value, AtomicDependencySet{this->precondition}}); + } + return this->atomic_dependencies.at(value); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc new file mode 100644 index 
0000000000..ae9fc669d3 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc @@ -0,0 +1,22 @@ +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "kernels/device_handle_t.h" +#include "utils/containers/transform.h" +#include "utils/json/optional.h" +#include + +namespace FlexFlow { + +DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle( + device_id_t const &device_id, + std::optional const &managed_handle) { + return DeviceSpecificManagedPerDeviceFFHandle{device_id, managed_handle}; +} + +device_handle_t device_handle_t_from_device_specific_managed_handle( + DeviceSpecificManagedPerDeviceFFHandle const &device_specific, + device_id_t device_idx) { + return device_handle_t_from_managed_handle_ptr( + device_specific.get(device_idx)); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc new file mode 100644 index 0000000000..986401956a --- /dev/null +++ b/lib/realm-execution/src/realm-execution/distributed_ff_handle.cc @@ -0,0 +1,51 @@ +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/tasks/impl/ff_handle_init_task.h" +#include "task-spec/device_specific.h" + +namespace FlexFlow { + +DistributedFfHandle::DistributedFfHandle( + std::unordered_map const &handles) + : handles(handles) {} + +DeviceSpecificManagedPerDeviceFFHandle const & + DistributedFfHandle::at(Realm::Processor processor) const { + return this->handles.at(processor); +} + +DistributedFfHandle + create_distributed_ff_handle(RealmContext &ctx, + size_t workSpaceSize, + bool allowTensorOpMathConversion, + Realm::Event precondition) { + std::unordered_map + handles; + + // Allocate space for the result before launching any tasks + Realm::Machine::ProcessorQuery 
pq(Realm::Machine::get_machine()); + for (Realm::Processor proc : pq) { + if (proc.kind() == Realm::Processor::LOC_PROC || + proc.kind() == Realm::Processor::TOC_PROC) { + handles.insert({proc, + make_device_specific_managed_handle( + ctx.get_current_device_idx(), std::nullopt)}); + } + } + + for (auto &[proc, handle] : handles) { + spawn_ff_handle_init_task(ctx, + proc, + workSpaceSize, + allowTensorOpMathConversion, + &handle, + precondition); + } + + ctx.get_outstanding_events().wait(); + + return DistributedFfHandle{handles}; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc new file mode 100644 index 0000000000..1e02fcf5d5 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc @@ -0,0 +1,84 @@ +#include "realm-execution/distributed_per_device_op_state_initialization.h" +#include "local-execution/per_device_op_state_initialization.h" +#include "realm-execution/tasks/impl/per_device_op_state_init_task.h" +#include "realm-execution/tensor_instance_backing.dtg.h" +#include "realm-execution/tensor_instance_backing.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "utils/containers/map_values.h" +#include "utils/containers/transform.h" +#include "utils/containers/values.h" +#include "utils/optional.h" +#include +#include +#include + +namespace FlexFlow { + +PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization( + RealmContext &ctx, + DynamicOpenDataflowGraph const &dg, + TensorInstanceBacking const &tensor_instance_backing, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig const &iteration_config, + 
OptimizerAttrs const &optimizer_attrs, + Realm::Event precondition) { + + // Initialize all operators and save the per-device op state + ASSERT(no_nodes_are_initialized(dg)); + + std::unordered_map *> + device_state_map; + for (DynamicNodeInvocation const &invocation : dg.invocations) { + Realm::Processor target_proc = ctx.map_device_coord_to_processor( + assert_unwrap(invocation.node_attrs.device_coord)); + + TensorInstanceBacking tensor_backing = + subset_tensor_instance_backing_for_invocation(tensor_instance_backing, + invocation); + + DeviceSpecificPtr *device_state_ptr = + new DeviceSpecificPtr{ctx.get_current_device_idx(), + std::nullopt}; + + std::optional completion_event = + spawn_per_device_op_state_init_task(ctx, + target_proc, + invocation, + tensor_backing, + profiling_settings, + device_handle.at(target_proc), + iteration_config, + optimizer_attrs, + device_state_ptr, + precondition); + + if (completion_event.has_value()) { + device_state_map.insert(std::pair{invocation, device_state_ptr}); + } else { + // Task doesn't require initialization, clean up and don't store result + delete device_state_ptr; + } + } + + ctx.get_outstanding_events().wait(); + + auto deref = [](DynamicNodeInvocation const &i, + DeviceSpecificPtr *const &p) { + return std::pair{i, *p}; + }; + std::unordered_map> + result = transform(device_state_map, deref); + + for (DeviceSpecificPtr *device_state_ptr : + values(device_state_map)) { + delete device_state_ptr; + } + + return PerDeviceOpStateBacking{/*backing=*/result}; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc new file mode 100644 index 0000000000..a2a40e3752 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc @@ -0,0 +1,64 @@ +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "pcg/device_type.dtg.h" +#include "task-spec/permissions.h" +#include "utils/exception.h" + +namespace FlexFlow { + +static DeviceType infer_device_type_from_memory_and_processor( + Realm::Memory inst_memory, Realm::Processor for_processor) { + DeviceType device_type; + switch (inst_memory.kind()) { + case Realm::Memory::SYSTEM_MEM: + // Only accessible on CPU + device_type = DeviceType::CPU; + break; + case Realm::Memory::GPU_FB_MEM: + // Only accessible on GPU + device_type = DeviceType::GPU; + break; + case Realm::Memory::Z_COPY_MEM: { + // Accessible on either CPU or GPU, so infer based on where we're trying + // to access from + switch (for_processor.kind()) { + case Realm::Processor::LOC_PROC: + device_type = DeviceType::CPU; + break; + case Realm::Processor::TOC_PROC: + device_type = DeviceType::GPU; + break; + default: + PANIC("Unexpected Realm Processor kind", for_processor.kind()); + } + } break; + default: + PANIC("Unexpected Realm Memory kind", inst_memory.kind()); + } + return device_type; +} + +DynamicTensorAccessor dynamic_tensor_accessor_from_instance( + Realm::RegionInstance inst, + Realm::Event ready, + ParallelTensorShape const ¶llel_tensor_shape, + Permissions const &permissions, + Realm::Processor for_processor) { + ready.wait(); + + DeviceType device_type = infer_device_type_from_memory_and_processor( + inst.get_location(), for_processor); + + size_t expected_size = + int{get_piece_size_in_bytes(parallel_tensor_shape).unwrap_num_bytes()}; + void *ptr = inst.pointer_untyped(/*offset=*/0, /*datalen=*/expected_size); + if (permissions == Permissions::RO) { + return DynamicTensorAccessor{GenericTensorAccessorR{ + get_piece_shape(parallel_tensor_shape), ptr, device_type}}; + } else { + return DynamicTensorAccessor{GenericTensorAccessorW{ + get_piece_shape(parallel_tensor_shape), ptr, device_type}}; + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_event.cc 
b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc new file mode 100644 index 0000000000..a5aed9481d --- /dev/null +++ b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc @@ -0,0 +1,9 @@ +#include "realm-execution/fmt/realm_event.h" + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m) { + return s << fmt::to_string(m); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc new file mode 100644 index 0000000000..301954f824 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc @@ -0,0 +1,10 @@ +#include "realm-execution/fmt/realm_instance.h" + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &s, + ::FlexFlow::Realm::RegionInstance const &m) { + return s << fmt::to_string(m); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc new file mode 100644 index 0000000000..4ef2919b10 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc @@ -0,0 +1,83 @@ +#include "realm-execution/instance_allocation.h" +#include "local-execution/tensor_allocation.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tensor_instance_backing.h" +#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "utils/bidict/generate_bidict.h" +#include "utils/containers/all_are_true.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/make.h" 
+#include "utils/containers/map_values.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/containers/values.h" +#include "utils/exception.h" +#include "utils/optional.h" + +namespace FlexFlow { + +std::pair + perform_instance_allocation_for_value( + MachineSpaceCoordinate const &device_coord, + DynamicValueAttrs const &value, + RealmContext &ctx) { + ASSERT(value.accessor == std::nullopt); + + TensorShape shape = get_piece_shape(value.parallel_tensor_shape.value()); + + Realm::Processor proc = ctx.map_device_coord_to_processor(device_coord); + Realm::Memory memory = ctx.get_nearest_memory(proc); + return ctx.create_instance(memory, shape, Realm::ProfilingRequestSet()); +} + +TensorInstanceBacking perform_instance_allocation( + DynamicOpenDataflowGraph const &g, + std::unordered_map const + &preallocated, + RealmContext &ctx) { + ASSERT(no_tensors_are_allocated(g)); + ASSERT(tensors_are_ready_for_allocation(g)); + for (DynamicValueAttrs const &v : keys(preallocated)) { + ASSERT(v.accessor == std::nullopt); + } + + TensorInstanceBacking result = make_empty_tensor_instance_backing(); + auto allocate = [&](DynamicNodeAttrs const &n, DynamicValueAttrs const &v) { + if (contains_key(preallocated, v)) { + // FIXME: Attach external instance to existing allocation and use that + NOT_IMPLEMENTED(); + } else { + if (!contains_key(result.backing, v)) { + MachineSpaceCoordinate device_coord = assert_unwrap(n.device_coord); + result.backing.insert(std::pair{ + v, perform_instance_allocation_for_value(device_coord, v, ctx)}); + } + return result.backing.at(v); + } + }; + + for (DynamicNodeInvocation const &invocation : g.invocations) { + for (DynamicValueAttrs const &input : values(invocation.inputs)) { + allocate(invocation.node_attrs, input); + } + for (DynamicValueAttrs const &output : values(invocation.outputs)) { + allocate(invocation.node_attrs, output); + } + } + + return result; +} + +void destroy_instances(TensorInstanceBacking const &instances, + 
Realm::Event precondition) { + for (auto const &[instance, ready] : values(instances.backing)) { + instance.destroy(Realm::Event::merge_events(precondition, ready)); + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance.cc new file mode 100644 index 0000000000..60d96eca49 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/pcg_instance.cc @@ -0,0 +1,330 @@ +#include "realm-execution/pcg_instance.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/optimizer_attrs.h" +#include "realm-execution/dependency_set.h" +#include "realm-execution/distributed_per_device_op_state_initialization.h" +#include "realm-execution/instance_allocation.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tasks/impl/op_task.h" +#include "realm-execution/tensor_instance_backing.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_task_type.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/loss_insertion.h" +#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h" +#include "task-spec/dynamic_graph/pass_expansion.h" +#include "task-spec/dynamic_graph/shard_expansion.h" +#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h" +#include "task-spec/dynamic_graph/update_insertion.h" +#include "utils/containers/map_values.h" +#include "utils/containers/transform.h" +#include "utils/containers/try_at.h" +#include "utils/containers/values.h" +#include "utils/graph/digraph/algorithms/get_topological_ordering.h" +#include "utils/optional.h" + +namespace FlexFlow { + +PCGInstance::PCGInstance( + RealmContext &ctx, + std::vector const &execution_order, + TensorInstanceBacking const 
&tensor_instance_backing, + PerDeviceOpStateBacking const &device_state_backing, + OptimizerAttrs const &optimizer_attrs, + std::optional logit_grad_tensor) + : ctx(ctx), execution_order(execution_order), + tensor_instance_backing(tensor_instance_backing), + device_state_backing(device_state_backing), + optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {} + +PCGInstance::~PCGInstance() { + destroy_instances(this->tensor_instance_backing, + ctx.get_outstanding_events()); +} + +RealmContext &PCGInstance::get_realm_context() { + return this->ctx; +} + +std::vector const & + PCGInstance::get_execution_order() const { + return this->execution_order; +} + +TensorInstanceBacking const &PCGInstance::get_tensor_instance_backing() const { + return this->tensor_instance_backing; +} + +PerDeviceOpStateBacking const &PCGInstance::get_device_state_backing() const { + return this->device_state_backing; +} + +OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const { + return this->optimizer_attrs; +} + +void PCGInstance::update_optimizer_attrs_for_next_iter() { + this->optimizer_attrs = + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); +} + +std::optional + PCGInstance::get_loss_tensor_instance() const { + return this->logit_grad_tensor; +} + +PCGInstance create_pcg_instance( + RealmContext &ctx, + MappedParallelComputationGraph const &mpcg, + OptimizerAttrs const &optimizer_attrs, + std::optional const &loss_attrs, + std::optional label_tensor, + std::optional logit_tensor, + std::optional const &loss_mapping, + std::unordered_map const + &input_tensors, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig const &iteration_config) { + + DynamicOpenDataflowGraph dg = + make_dynamic_open_dataflow_graph_from_mpcg(mpcg); + dg = perform_pass_expansion(dg); + + std::unordered_map inputs = + input_tensors; + std::optional logit_grad_value; + if (loss_attrs) { + auto [dg2, label_v, 
logit_grad_v] = perform_loss_insertion( + dg, + assert_unwrap(loss_attrs), + dynamic_tensor_guid_t{assert_unwrap(logit_tensor)}, + loss_mapping); + dg = dg2; + logit_grad_value = logit_grad_v; + inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)}); + } + + dg = perform_update_insertion(dg, optimizer_attrs); + dg = perform_shard_expansion(dg); + TensorInstanceBacking tensor_instance_backing = + perform_instance_allocation(dg, inputs, ctx); + + logit_grad_value = + transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) { + for (DynamicNodeInvocation const &invocation : dg.invocations) { + if (invocation.node_attrs.task_type != DynamicTaskType::LOSS) { + continue; + } + for (auto const &[slot, value] : invocation.outputs) { + if (slot.slot_name == TensorSlotName::LOGIT && + value.tensor_guid == lgv.tensor_guid && + value.role == lgv.role) { + return value; + } + } + } + PANIC("couldn't find updated logit grad in the shard-expanded dynamic " + "graph"); + }); + + std::optional logit_grad_tensor = + transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) { + return tensor_instance_backing.backing.at(lgv).first; + }); + + PerDeviceOpStateBacking device_state_backing = + perform_distributed_per_device_op_state_initialization( + ctx, + dg, + tensor_instance_backing, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, + ctx.get_outstanding_events()); + + // Compute the topological ordering of the graph + auto [kwarg_graph, node_map] = + labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph(dg); + std::vector node_topo_order = get_topological_ordering(kwarg_graph); + std::vector invocation_topo_order = transform( + node_topo_order, [&](Node node) { return node_map.at_l(node); }); + + return PCGInstance{/*ctx=*/ctx, + /*execution_order=*/invocation_topo_order, + /*tensor_instance_backing=*/tensor_instance_backing, + /*device_state_backing=*/device_state_backing, + /*optimizer_attrs=*/optimizer_attrs, + 
/*logit_grad_tensor=*/logit_grad_tensor}; +} + +static std::unordered_map + execute_distributed_dynamic_node_invocation_set( + RealmContext &ctx, + std::vector const &invocations, + TensorInstanceBacking const &tensor_instance_backing, + PerDeviceOpStateBacking const &device_state_backing, + OptimizerAttrs const &optimizer_attrs, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig iteration_config) { + // For simplicity we'll track a dependency on all outstanding operations up to + // this point. This will create an effective barrier between phases. + DependencySet dependency_set{ctx.get_outstanding_events()}; + return unordered_map_from_pairs( + transform(invocations, [&](DynamicNodeInvocation const &invocation) { + TrainingOperationAttrs op_attrs = + assert_unwrap(invocation.node_attrs.op_attrs); + if (op_attrs.is_pcg_op() && (op_attrs.require_pcg_op().is_input() || + op_attrs.require_pcg_op().is_weight())) { + return std::pair{invocation.node_attrs.layer_guid, + Realm::Event::NO_EVENT}; + } + + std::vector input_dependencies = + transform(vector_of(values(invocation.inputs)), + [&](DynamicValueAttrs const &value) { + return dependency_set.get_dependency_for_reader(value); + }); + std::vector output_dependencies = + transform(vector_of(values(invocation.outputs)), + [&](DynamicValueAttrs const &value) { + return dependency_set.get_dependency_for_writer(value); + }); + Realm::Event dependencies = Realm::Event::merge_events( + Realm::Event::merge_events(input_dependencies), + Realm::Event::merge_events(output_dependencies)); + Realm::Processor target_proc = ctx.map_device_coord_to_processor( + assert_unwrap(invocation.node_attrs.device_coord)); + + TensorInstanceBacking tensor_backing = + subset_tensor_instance_backing_for_invocation( + tensor_instance_backing, invocation); + + Realm::Event result = + spawn_op_task(ctx, + target_proc, + invocation, + tensor_backing, + try_at(device_state_backing.backing, 
invocation), + profiling_settings, + device_handle.at(target_proc), + iteration_config, + optimizer_attrs, + dependencies); + for (DynamicValueAttrs const &value : values(invocation.inputs)) { + dependency_set.add_reader(value, result); + } + for (DynamicValueAttrs const &value : values(invocation.outputs)) { + dependency_set.add_writer(value, result); + } + return std::pair{invocation.node_attrs.layer_guid, result}; + })); +} + +std::unordered_map + perform_all_passes_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig iteration_config) { + std::vector execution_order = + pcg_instance.get_execution_order(); + std::unordered_map result = + execute_distributed_dynamic_node_invocation_set( + /*ctx=*/pcg_instance.get_realm_context(), + /*invocations=*/execution_order, + /*tensor_instance_backing=*/ + pcg_instance.get_tensor_instance_backing(), + /*device_state_backing=*/pcg_instance.get_device_state_backing(), + /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*device_handle=*/device_handle, + /*iteration_config=*/iteration_config); + pcg_instance.update_optimizer_attrs_for_next_iter(); + return result; +} + +std::unordered_map + perform_forward_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig iteration_config) { + std::vector execution_order = + filter(pcg_instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::FWD; + }); + + return execute_distributed_dynamic_node_invocation_set( + /*ctx=*/pcg_instance.get_realm_context(), + /*invocations=*/execution_order, + /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(), + 
/*device_state_backing=*/pcg_instance.get_device_state_backing(), + /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*device_handle=*/device_handle, + /*iteration_config=*/iteration_config); +} + +std::unordered_map + perform_backward_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig iteration_config) { + std::vector execution_order = + filter(pcg_instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::BWD; + }); + + return execute_distributed_dynamic_node_invocation_set( + /*ctx=*/pcg_instance.get_realm_context(), + /*invocations=*/execution_order, + /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(), + /*device_state_backing=*/pcg_instance.get_device_state_backing(), + /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*device_handle=*/device_handle, + /*iteration_config=*/iteration_config); +} + +std::unordered_map + perform_update_pass_for_pcg_instance( + PCGInstance &pcg_instance, + ProfilingSettings const &profiling_settings, + DistributedFfHandle const &device_handle, + FFIterationConfig iteration_config) { + std::vector execution_order = + filter(pcg_instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::UPD; + }); + + std::unordered_map result = + execute_distributed_dynamic_node_invocation_set( + /*ctx=*/pcg_instance.get_realm_context(), + /*invocations=*/execution_order, + /*tensor_instance_backing=*/ + pcg_instance.get_tensor_instance_backing(), + /*device_state_backing=*/pcg_instance.get_device_state_backing(), + 
/*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*device_handle=*/device_handle, + /*iteration_config=*/iteration_config); + pcg_instance.update_optimizer_attrs_for_next_iter(); + return result; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc new file mode 100644 index 0000000000..194210cf5a --- /dev/null +++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc @@ -0,0 +1,64 @@ +#include "realm-execution/realm_allocator.h" +#include "kernels/device.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +RealmAllocator::RealmAllocator(Realm::Processor processor, Realm::Memory memory) + : processor(processor), memory(memory) {} + +RealmAllocator::~RealmAllocator() { + for (Realm::RegionInstance const &instance : values(this->ptr_instances)) { + instance.destroy(Realm::Event::NO_EVENT); + } +} + +void *RealmAllocator::allocate(size_t requested_memory_size) { + Realm::Rect<1> bounds{Realm::Point<1>::ZEROES(), + Realm::Point<1>{requested_memory_size} - + Realm::Point<1>::ONES()}; + std::vector field_sizes{1}; + Realm::RegionInstance inst; + Realm::Event ready = + Realm::RegionInstance::create_instance(inst, + this->memory, + bounds, + field_sizes, + 0 /*SOA*/, + Realm::ProfilingRequestSet{}); + ready.wait(); + void *ptr = + inst.pointer_untyped(/*offset=*/0, /*datalen=*/requested_memory_size); + ASSERT(ptr); + this->ptr_instances.insert({ptr, inst}); + return ptr; +} + +void RealmAllocator::deallocate(void *ptr) { + ASSERT(contains_key(this->ptr_instances, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + this->ptr_instances.at(ptr).destroy(Realm::Event::NO_EVENT); + this->ptr_instances.erase(ptr); +} + +DeviceType RealmAllocator::get_allocation_device_type() const { + switch 
(this->processor.kind()) { + case Realm::Processor::Kind::LOC_PROC: + return DeviceType::CPU; + case Realm::Processor::Kind::TOC_PROC: + return DeviceType::GPU; + default: + PANIC("Unhandled FwbTensorType", this->processor.kind()); + } +} + +Allocator get_realm_allocator(Realm::Processor processor, + Realm::Memory memory) { + Allocator allocator = Allocator::create(processor, memory); + return allocator; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc new file mode 100644 index 0000000000..4e981e7414 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/realm_context.cc @@ -0,0 +1,253 @@ +#include "realm-execution/realm_context.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_handle_t.h" +#include "op-attrs/datatype.h" +#include "op-attrs/tensor_dims.dtg.h" +#include "pcg/device_id_t.h" +#include "pcg/device_type.dtg.h" +#include "realm-execution/realm_allocator.h" +#include "realm-execution/tasks/task_id_t.dtg.h" +#include "realm-execution/tasks/task_id_t.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/one_to_many/one_to_many.h" +#include "utils/positive_int/positive_int.h" + +namespace FlexFlow { + +RealmContext::RealmContext(Realm::Processor processor) + : processor(processor), + allocator(get_realm_allocator( + processor, RealmContext::get_nearest_memory(processor))) {} + +RealmContext::~RealmContext() { + if (!this->outstanding_events.empty()) { + Realm::Event outstanding = this->merge_outstanding_events(); + outstanding.wait(); + } +} + +static std::tuple + convert_machine_space_coordinate( + MachineSpaceCoordinate const &device_coord) { + Realm::AddressSpace as = int{device_coord.node_idx}; + Realm::Processor::Kind kind; + switch (device_coord.device_type) { + case DeviceType::CPU: + kind = 
Realm::Processor::Kind::LOC_PROC; + break; + case DeviceType::GPU: + kind = Realm::Processor::Kind::TOC_PROC; + break; + default: + PANIC("Unhandled DeviceType", fmt::to_string(device_coord.device_type)); + break; + } + nonnegative_int proc_in_node = device_coord.device_idx; + return std::tuple{as, kind, proc_in_node}; +} + +Realm::Processor RealmContext::map_device_coord_to_processor( + MachineSpaceCoordinate const &device_coord) { + this->discover_machine_topology(); + auto [as, kind, proc_in_node] = + convert_machine_space_coordinate(device_coord); + return this->processors.at(std::pair{as, kind}).at(int{proc_in_node}); +} + +Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) { + if (!proc.exists()) { + return Realm::Memory::NO_MEMORY; + } + + // FIXME: this isn't going to do what you expect until + // https://github.com/StanfordLegion/realm/pull/392 merges + Realm::Machine::MemoryQuery mq(Realm::Machine::get_machine()); + mq.best_affinity_to(proc); + ASSERT(mq.count() > 0); + return mq.first(); +} + +Realm::Processor RealmContext::get_current_processor() const { + return this->processor; +} + +Allocator &RealmContext::get_current_device_allocator() { + return this->allocator; +} + +device_id_t RealmContext::get_current_device_idx() const { + Realm::Processor proc = this->get_current_processor(); + + // FIXME: find a more efficient way to implement this than scanning the + // machine every time + Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine()); + pq.same_address_space_as(proc); + nonnegative_int idx{0}; + for (Realm::Processor p : pq) { + if (p == proc) { + break; + } + idx++; + } + + switch (proc.kind()) { + case Realm::Processor::LOC_PROC: + return make_device_id_t_from_idx(idx, DeviceType::CPU); + case Realm::Processor::TOC_PROC: + return make_device_id_t_from_idx(idx, DeviceType::GPU); + default: + PANIC("Unhandled Realm::ProcessorKind", fmt::to_string(int{proc.kind()})); + } +} + +Realm::Event + 
RealmContext::spawn_task(Realm::Processor proc, + task_id_t task_id, + void const *args, + size_t arglen, + Realm::ProfilingRequestSet const &requests, + Realm::Event wait_on, + int priority) { + Realm::Event result = proc.spawn(get_realm_task_id_for_task_id(task_id), + args, + arglen, + requests, + wait_on, + priority); + this->outstanding_events.push_back(result); + return result; +} + +Realm::Event RealmContext::collective_spawn_task(Realm::Processor target_proc, + task_id_t task_id, + void const *args, + size_t arglen, + Realm::Event wait_on, + int priority) { + Realm::Event result = + this->runtime.collective_spawn(target_proc, + get_realm_task_id_for_task_id(task_id), + args, + arglen, + wait_on, + priority); + this->outstanding_events.push_back(result); + return result; +} + +template +static Realm::Rect rect_from_dims(TensorDims const &dims) { + std::vector values{dims.ff_ordered.begin(), dims.ff_ordered.end()}; + ASSERT(values.size() == N); + return Realm::Rect{Realm::Point::ZEROES(), + Realm::Point{values.data()} - + Realm::Point::ONES()}; +} + +std::pair + RealmContext::create_instance(Realm::Memory memory, + TensorShape const &shape, + Realm::ProfilingRequestSet const &prs, + Realm::Event wait_on) { + std::vector field_sizes{ + static_cast(int{size_of_datatype(shape.data_type)})}; + Realm::RegionInstance inst; + Realm::Event ready; + switch (shape.dims.ff_ordered.num_dims()) { +#if REALM_MAX_DIM >= 1 + case 1: + ready = + Realm::RegionInstance::create_instance(inst, + memory, + rect_from_dims<1>(shape.dims), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 2 + case 2: + ready = + Realm::RegionInstance::create_instance(inst, + memory, + rect_from_dims<2>(shape.dims), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 3 + case 3: + ready = + Realm::RegionInstance::create_instance(inst, + memory, + rect_from_dims<3>(shape.dims), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; 
+#endif +#if REALM_MAX_DIM >= 4 + case 4: + ready = + Realm::RegionInstance::create_instance(inst, + memory, + rect_from_dims<4>(shape.dims), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif +#if REALM_MAX_DIM >= 5 + case 5: + ready = + Realm::RegionInstance::create_instance(inst, + memory, + rect_from_dims<5>(shape.dims), + field_sizes, + 0 /*SOA*/, + prs, + wait_on); + break; +#endif + default: + PANIC("TensorShape dims greater than REALM_MAX_DIM", + fmt::to_string(shape.dims.ff_ordered.num_dims())); + break; + } + this->outstanding_events.push_back(ready); + return std::pair{inst, ready}; +} + +Realm::Event RealmContext::get_outstanding_events() { + Realm::Event result = this->merge_outstanding_events(); + this->outstanding_events.push_back(result); + return result; +} + +Realm::Event RealmContext::merge_outstanding_events() { + Realm::Event result = Realm::Event::merge_events(this->outstanding_events); + this->outstanding_events.clear(); + return result; +} + +void RealmContext::discover_machine_topology() { + if (!this->processors.empty()) { + return; + } + + Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine()); + for (Realm::Processor proc : pq) { + Realm::AddressSpace as = proc.address_space(); + Realm::Processor::Kind kind = proc.kind(); + this->processors[std::pair{as, kind}].push_back(proc); + } +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc new file mode 100644 index 0000000000..fc74fffe5d --- /dev/null +++ b/lib/realm-execution/src/realm-execution/realm_manager.cc @@ -0,0 +1,34 @@ +#include "realm-execution/realm_manager.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/tasks/impl/controller_task.h" +#include "realm-execution/tasks/realm_task_registry.h" + +namespace FlexFlow { + +RealmManager::RealmManager(int *argc, char ***argv) + : RealmContext(Realm::Processor::NO_PROC) { + bool ok = 
this->runtime.init(argc, argv); + ASSERT(ok); + + // Register all tasks at initialization time so we don't need to later + register_all_tasks().wait(); +} + +RealmManager::~RealmManager() { + Realm::Event outstanding = this->merge_outstanding_events(); + this->runtime.shutdown(outstanding); + this->runtime.wait_for_shutdown(); +} + +Realm::Event + RealmManager::start_controller(std::function thunk, + Realm::Event wait_on) { + Realm::Processor target_proc = + Realm::Machine::ProcessorQuery(Realm::Machine::get_machine()) + .only_kind(Realm::Processor::LOC_PROC) + .first(); + + return collective_spawn_controller_task(*this, target_proc, thunk, wait_on); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc new file mode 100644 index 0000000000..285e8acaa7 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc @@ -0,0 +1,39 @@ +#include "realm-execution/tasks/impl/op_task.h" +#include "realm-execution/tasks/task_id_t.h" + +namespace FlexFlow { + +struct ControllerTaskArgs { +public: + std::function thunk; +}; + +void controller_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + ASSERT(arglen == sizeof(ControllerTaskArgs)); + ControllerTaskArgs task_args = + *reinterpret_cast(args); + + RealmContext ctx{proc}; + task_args.thunk(ctx); +} + +Realm::Event + collective_spawn_controller_task(RealmContext &ctx, + Realm::Processor &target_proc, + std::function thunk, + Realm::Event precondition) { + ControllerTaskArgs task_args; + task_args.thunk = thunk; + + return ctx.collective_spawn_task(target_proc, + task_id_t::CONTROLLER_TASK_ID, + &task_args, + sizeof(task_args), + precondition); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc 
b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc new file mode 100644 index 0000000000..1a90052fa7 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_return_task.cc @@ -0,0 +1,51 @@ +#include "realm-execution/tasks/impl/ff_handle_init_task.h" +#include "realm-execution/tasks/task_id_t.dtg.h" + +namespace FlexFlow { + +struct FfHandleInitReturnTaskArgs { +public: + FfHandleInitReturnTaskArgs() = delete; + FfHandleInitReturnTaskArgs( + DeviceSpecificManagedPerDeviceFFHandle result, + Realm::Processor origin_proc, + DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr) + : result(result), origin_proc(origin_proc), + origin_result_ptr(origin_result_ptr) {} + +public: + DeviceSpecificManagedPerDeviceFFHandle result; + Realm::Processor origin_proc; + DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr; +}; + +void ff_handle_init_return_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + ASSERT(arglen == sizeof(FfHandleInitReturnTaskArgs)); + FfHandleInitReturnTaskArgs task_args = + *reinterpret_cast(args); + + ASSERT(task_args.origin_proc.address_space() == proc.address_space()); + *task_args.origin_result_ptr = task_args.result; +} + +Realm::Event spawn_ff_handle_init_return_task( + RealmContext &ctx, + Realm::Processor origin_proc, + DeviceSpecificManagedPerDeviceFFHandle const &result, + DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr, + Realm::Event precondition) { + FfHandleInitReturnTaskArgs task_args{result, origin_proc, origin_result_ptr}; + + return ctx.spawn_task(origin_proc, + task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID, + &task_args, + sizeof(task_args), + Realm::ProfilingRequestSet{}, + precondition); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc new file mode 
100644 index 0000000000..86d03e45f3 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/ff_handle_init_task.cc @@ -0,0 +1,79 @@ +#include "realm-execution/tasks/impl/ff_handle_init_task.h" +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/tasks/impl/ff_handle_init_return_task.h" +#include "realm-execution/tasks/impl/ff_handle_init_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h" +#include "realm-execution/tasks/serializer/task_arg_serializer.h" +#include "realm-execution/tasks/task_id_t.dtg.h" +#include + +namespace FlexFlow { + +static std::optional + make_ff_handle_for_processor(Realm::Processor processor, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { + switch (processor.kind()) { + case Realm::Processor::LOC_PROC: + return std::nullopt; + case Realm::Processor::TOC_PROC: + return new ManagedPerDeviceFFHandle{initialize_multi_gpu_handle( + /*num_ranks=*/Realm::Machine::get_machine().get_address_space_count(), + /*my_rank=*/processor.address_space(), + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion)}; + default: + PANIC("Unhandled Realm::ProcessorKind", + fmt::to_string(int{processor.kind()})); + } +} + +void ff_handle_init_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + FfHandleInitTaskArgs task_args = ff_handle_init_task_args_from_serializable( + deserialize_task_args(args, arglen)); + + RealmContext ctx{proc}; + DeviceSpecificManagedPerDeviceFFHandle managed_handle = + make_device_specific_managed_handle( + ctx.get_current_device_idx(), + make_ff_handle_for_processor(proc, + task_args.workSpaceSize, + task_args.allowTensorOpMathConversion)); + + spawn_ff_handle_init_return_task(ctx, + task_args.origin_proc, + managed_handle, + task_args.origin_result_ptr, + Realm::Event::NO_EVENT); +} + +Realm::Event 
spawn_ff_handle_init_task( + RealmContext &ctx, + Realm::Processor target_proc, + size_t workSpaceSize, + bool allowTensorOpMathConversion, + DeviceSpecificManagedPerDeviceFFHandle *result_ptr, + Realm::Event precondition) { + + FfHandleInitTaskArgs task_args = FfHandleInitTaskArgs{ + workSpaceSize, + allowTensorOpMathConversion, + ctx.get_current_processor(), + result_ptr, + }; + + std::string serialized_args = + serialize_task_args(ff_handle_init_task_args_to_serializable(task_args)); + return ctx.spawn_task(target_proc, + task_id_t::DEVICE_HANDLE_INIT_TASK_ID, + serialized_args.data(), + serialized_args.size(), + Realm::ProfilingRequestSet{}, + precondition); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc new file mode 100644 index 0000000000..2eaec4d6ea --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc @@ -0,0 +1,98 @@ +#include "realm-execution/tasks/impl/op_task.h" +#include "local-execution/task_execution.h" +#include "realm-execution/device_specific_managed_per_device_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/tasks/impl/op_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_op_task_args.h" +#include "realm-execution/tasks/serializer/task_arg_serializer.h" +#include "realm-execution/tasks/task_id_t.h" +#include "task-spec/per_device_op_state.dtg.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/permissions.h" +#include "utils/containers/map_values.h" +#include "utils/containers/transform.h" +#include "utils/optional.h" +#include + +namespace FlexFlow { + +void op_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + OpTaskArgs task_args = op_task_args_from_serializable( + deserialize_task_args(args, arglen)); + + RealmContext ctx{proc}; + 
device_handle_t device_handle = + device_handle_t_from_device_specific_managed_handle( + task_args.device_handle, ctx.get_current_device_idx()); + + // Patch the invocation to include the provided instances + auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) { + DynamicValueAttrs result = value; + auto const &[inst, event] = task_args.tensor_backing.backing.at(value); + result.accessor = dynamic_tensor_accessor_from_instance( + inst, + event, + assert_unwrap(value.parallel_tensor_shape), + Permissions::RW, // FIXME: get real permissions? + ctx.get_current_processor()); + return result; + }; + DynamicNodeInvocation invocation = task_args.invocation; + invocation.inputs = map_values(invocation.inputs, map_instance_to_accessor); + invocation.outputs = map_values(invocation.outputs, map_instance_to_accessor); + + execute_dynamic_node_invocation( + /*invocation=*/invocation, + /*allocator=*/ctx.get_current_device_allocator(), + /*profiling_settings=*/task_args.profiling_settings, + /*ff_handle=*/device_handle, + /*per_device_op_state=*/ + transform(and_then(task_args.device_state, + [&](DeviceSpecificPtr const &d) { + return d.get(ctx.get_current_device_idx()); + }), + [](PerDeviceOpState *ptr) { return *ptr; }), + /*iteration_config=*/task_args.iteration_config, + /*optimizer_attrs=*/task_args.optimizer_attrs, + /*device_idx=*/ctx.get_current_device_idx()); +} + +Realm::Event spawn_op_task( + RealmContext &ctx, + Realm::Processor target_proc, + DynamicNodeInvocation const &invocation, + TensorInstanceBacking const &tensor_backing, + std::optional> const &device_state, + ProfilingSettings const &profiling_settings, + DeviceSpecificManagedPerDeviceFFHandle const &device_handle, + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + Realm::Event precondition) { + + OpTaskArgs task_args = OpTaskArgs{ + invocation, + tensor_backing, + device_state, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, 
+ }; + + std::string serialized_args = + serialize_task_args(op_task_args_to_serializable(task_args)); + + return ctx.spawn_task( + target_proc, + assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)), + serialized_args.data(), + serialized_args.size(), + Realm::ProfilingRequestSet{}, + precondition); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc new file mode 100644 index 0000000000..222ddb28b8 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_return_task.cc @@ -0,0 +1,52 @@ +#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h" +#include "realm-execution/tasks/task_id_t.dtg.h" + +namespace FlexFlow { + +struct PerDeviceOpStateInitReturnTaskArgs { +public: + PerDeviceOpStateInitReturnTaskArgs() = delete; + PerDeviceOpStateInitReturnTaskArgs( + DeviceSpecificPtr result, + Realm::Processor origin_proc, + DeviceSpecificPtr *origin_result_ptr) + : result(result), origin_proc(origin_proc), + origin_result_ptr(origin_result_ptr) {} + +public: + DeviceSpecificPtr result; + Realm::Processor origin_proc; + DeviceSpecificPtr *origin_result_ptr; +}; + +void per_device_op_state_init_return_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + ASSERT(arglen == sizeof(PerDeviceOpStateInitReturnTaskArgs)); + PerDeviceOpStateInitReturnTaskArgs task_args = + *reinterpret_cast(args); + + ASSERT(task_args.origin_proc.address_space() == proc.address_space()); + *task_args.origin_result_ptr = task_args.result; +} + +Realm::Event spawn_per_device_op_state_init_return_task( + RealmContext &ctx, + Realm::Processor origin_proc, + DeviceSpecificPtr const &result, + DeviceSpecificPtr *origin_result_ptr, + Realm::Event precondition) { + 
PerDeviceOpStateInitReturnTaskArgs task_args{ + result, origin_proc, origin_result_ptr}; + + return ctx.spawn_task(origin_proc, + task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID, + &task_args, + sizeof(task_args), + Realm::ProfilingRequestSet{}, + precondition); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc new file mode 100644 index 0000000000..c5ff8f39be --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/per_device_op_state_init_task.cc @@ -0,0 +1,117 @@ +#include "realm-execution/tasks/impl/per_device_op_state_init_task.h" +#include "local-execution/per_device_op_state_initialization.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h" +#include "realm-execution/tasks/impl/per_device_op_state_init_task_args.dtg.h" +#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h" +#include "realm-execution/tasks/serializer/task_arg_serializer.h" +#include "realm-execution/tasks/task_id_t.dtg.h" +#include "realm-execution/tasks/task_id_t.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/per_device_op_state.h" +#include "utils/containers/map_values.h" +#include "utils/containers/transform.h" +#include "utils/optional.h" +#include +#include + +namespace FlexFlow { + +void per_device_op_state_init_task_body(void const *args, + size_t arglen, + void const *userdata, + size_t userlen, + Realm::Processor proc) { + PerDeviceOpStateInitTaskArgs task_args = + per_device_op_state_init_task_args_from_serializable( + deserialize_task_args( + args, arglen)); + + RealmContext ctx{proc}; + device_handle_t device_handle = + device_handle_t_from_device_specific_managed_handle( + 
task_args.device_handle, ctx.get_current_device_idx()); + + // Patch the invocation to include the provided instances + auto map_instance_to_accessor = [&](DynamicValueAttrs const &value) { + DynamicValueAttrs result = value; + auto const &[inst, event] = task_args.tensor_backing.backing.at(value); + result.accessor = dynamic_tensor_accessor_from_instance( + inst, + event, + assert_unwrap(value.parallel_tensor_shape), + Permissions::RW, // FIXME: get real permissions? + ctx.get_current_processor()); + return result; + }; + DynamicNodeInvocation invocation = task_args.invocation; + invocation.inputs = map_values(invocation.inputs, map_instance_to_accessor); + invocation.outputs = map_values(invocation.outputs, map_instance_to_accessor); + + DynamicNodeInvocation result_invocation = + initialize_node(invocation, + ctx.get_current_device_allocator(), + task_args.profiling_settings, + device_handle, + task_args.iteration_config, + task_args.optimizer_attrs, + ctx.get_current_device_idx()); + DeviceSpecificPerDeviceOpState result_state = + assert_unwrap(result_invocation.node_attrs.per_device_op_state); + // Important: to make sure this doesn't get deallocated, we intentionally leak + // the allocation here + PerDeviceOpState *result_state_ptr = + new PerDeviceOpState{get_per_device_op_state_from_device_specific( + result_state, ctx.get_current_device_idx())}; + DeviceSpecificPtr result_device_specific{ + ctx.get_current_device_idx(), result_state_ptr}; + spawn_per_device_op_state_init_return_task(ctx, + task_args.origin_proc, + result_device_specific, + task_args.origin_result_ptr, + Realm::Event::NO_EVENT); +} + +std::optional spawn_per_device_op_state_init_task( + RealmContext &ctx, + Realm::Processor target_proc, + DynamicNodeInvocation const &invocation, + TensorInstanceBacking const &tensor_backing, + ProfilingSettings const &profiling_settings, + DeviceSpecificManagedPerDeviceFFHandle const &device_handle, + FFIterationConfig const &iteration_config, + 
OptimizerAttrs const &optimizer_attrs, + DeviceSpecificPtr *result_ptr, + Realm::Event precondition) { + PerDeviceOpStateInitTaskArgs task_args{ + invocation, + tensor_backing, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, + ctx.get_current_processor(), + result_ptr, + }; + + std::optional task_id = + and_then(and_then(invocation.node_attrs.op_attrs, + [](TrainingOperationAttrs const &op_attrs) { + return op_attrs.try_require_pcg_op(); + }), + get_init_task_id_for_op_attrs); + if (task_id.has_value()) { + std::string args = serialize_task_args( + per_device_op_state_init_task_args_to_serializable(task_args)); + return ctx.spawn_task(target_proc, + assert_unwrap(task_id), + args.data(), + args.size(), + Realm::ProfilingRequestSet{}, + precondition); + } + return std::nullopt; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc new file mode 100644 index 0000000000..0aaa3dacae --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_ff_handle_init_task_args.cc @@ -0,0 +1,27 @@ +#include "realm-execution/tasks/impl/serializable_ff_handle_init_task_args.h" + +namespace FlexFlow { + +SerializableFfHandleInitTaskArgs + ff_handle_init_task_args_to_serializable(FfHandleInitTaskArgs const &args) { + return SerializableFfHandleInitTaskArgs{ + /*workSpaceSize=*/args.workSpaceSize, + /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion, + /*origin_proc=*/realm_processor_to_serializable(args.origin_proc), + /*origin_result_ptr=*/reinterpret_cast(args.origin_result_ptr), + }; +} + +FfHandleInitTaskArgs ff_handle_init_task_args_from_serializable( + SerializableFfHandleInitTaskArgs const &args) { + return FfHandleInitTaskArgs{ + /*workSpaceSize=*/args.workSpaceSize, + /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion, + 
/*origin_proc=*/realm_processor_from_serializable(args.origin_proc), + /*origin_result_ptr=*/ + reinterpret_cast( + args.origin_result_ptr), + }; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc new file mode 100644 index 0000000000..32d54adc37 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc @@ -0,0 +1,41 @@ +#include "realm-execution/tasks/impl/serializable_op_task_args.h" +#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h" +#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h" +#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) { + return SerializableOpTaskArgs{ + /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation), + /*tensor_backing*/ + tensor_instance_backing_to_serializable(args.tensor_backing), + /*device_state=*/ + transform(args.device_state, + device_specific_ptr_to_serializable), + /*profiling_settings=*/args.profiling_settings, + /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle), + /*iteration_config=*/args.iteration_config, + /*optimizer_attrs=*/args.optimizer_attrs, + }; +} + +OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) { + return OpTaskArgs{ + /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation), + /*tensor_backing*/ + tensor_instance_backing_from_serializable(args.tensor_backing), + /*device_state=*/ + transform(args.device_state, + device_specific_ptr_from_serializable), + /*profiling_settings=*/args.profiling_settings, + /*device_handle=*/ + device_specific_ptr_from_serializable( + args.device_handle), + 
/*iteration_config=*/args.iteration_config, + /*optimizer_attrs=*/args.optimizer_attrs, + }; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc new file mode 100644 index 0000000000..7b52d9c03d --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.cc @@ -0,0 +1,45 @@ +#include "realm-execution/tasks/impl/serializable_per_device_op_state_init_task_args.h" +#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h" +#include "realm-execution/tasks/serializer/serializable_realm_processor.h" +#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h" +#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h" + +namespace FlexFlow { + +SerializablePerDeviceOpStateInitTaskArgs + per_device_op_state_init_task_args_to_serializable( + PerDeviceOpStateInitTaskArgs const &args) { + return SerializablePerDeviceOpStateInitTaskArgs{ + /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation), + /*tensor_backing*/ + tensor_instance_backing_to_serializable(args.tensor_backing), + /*profiling_settings=*/args.profiling_settings, + /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle), + /*iteration_config=*/args.iteration_config, + /*optimizer_attrs=*/args.optimizer_attrs, + /*origin_proc=*/realm_processor_to_serializable(args.origin_proc), + /*origin_result_ptr=*/reinterpret_cast(args.origin_result_ptr), + }; +} + +PerDeviceOpStateInitTaskArgs + per_device_op_state_init_task_args_from_serializable( + SerializablePerDeviceOpStateInitTaskArgs const &args) { + return PerDeviceOpStateInitTaskArgs{ + /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation), + /*tensor_backing*/ + 
tensor_instance_backing_from_serializable(args.tensor_backing), + /*profiling_settings=*/args.profiling_settings, + /*device_handle=*/ + device_specific_ptr_from_serializable( + args.device_handle), + /*iteration_config=*/args.iteration_config, + /*optimizer_attrs=*/args.optimizer_attrs, + /*origin_proc=*/realm_processor_from_serializable(args.origin_proc), + /*origin_result_ptr=*/ + reinterpret_cast *>( + args.origin_result_ptr), + }; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc new file mode 100644 index 0000000000..e7a8948f8d --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc @@ -0,0 +1,163 @@ +#include "realm-execution/tasks/realm_task_registry.h" +#include "realm-execution/tasks/impl/controller_task.h" +#include "realm-execution/tasks/impl/ff_handle_init_return_task.h" +#include "realm-execution/tasks/impl/ff_handle_init_task.h" +#include "realm-execution/tasks/impl/op_task.h" +#include "realm-execution/tasks/impl/per_device_op_state_init_return_task.h" +#include "realm-execution/tasks/impl/per_device_op_state_init_task.h" +#include "realm-execution/tasks/task_id_t.h" +#include "utils/exception.h" + +namespace FlexFlow { + +Realm::Event register_task(Realm::Processor::Kind target_kind, + task_id_t func_id, + void (*task_body)(void const *, + size_t, + void const *, + size_t, + Realm::Processor)) { + Realm::Processor::TaskFuncID realm_task_id = + get_realm_task_id_for_task_id(func_id); +#ifdef FF_USE_PREALM + Realm::prealm_task_name(realm_task_id, fmt::to_string(func_id)); +#endif + return Realm::Processor::register_task_by_kind( + target_kind, + /*global=*/false, + realm_task_id, + Realm::CodeDescriptor(task_body), + Realm::ProfilingRequestSet()); +} + +Realm::Event register_all_tasks() { + std::vector pending_registrations; + + std::vector init_task_ids = { + // Init tasks + 
task_id_t::BATCHNORM_INIT_TASK_ID, + task_id_t::COMBINE_INIT_TASK_ID, + task_id_t::CONV2D_INIT_TASK_ID, + task_id_t::DROPOUT_INIT_TASK_ID, + task_id_t::ELEMENTBINARY_INIT_TASK_ID, + task_id_t::ELEMENTUNARY_INIT_TASK_ID, + task_id_t::GATHER_INIT_TASK_ID, + task_id_t::LAYERNORM_INIT_TASK_ID, + task_id_t::LINEAR_INIT_TASK_ID, + task_id_t::ATTENTION_INIT_TASK_ID, + task_id_t::POOL2D_INIT_TASK_ID, + task_id_t::REDUCE_INIT_TASK_ID, + task_id_t::REDUCTION_INIT_TASK_ID, + task_id_t::REPARTITION_INIT_TASK_ID, + task_id_t::REPLICATE_INIT_TASK_ID, + task_id_t::SOFTMAX_INIT_TASK_ID, + }; + + for (task_id_t task_id : init_task_ids) { + pending_registrations.push_back( + register_task(Realm::Processor::LOC_PROC, + task_id, + per_device_op_state_init_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::TOC_PROC, + task_id, + per_device_op_state_init_task_body)); + } + + std::vector task_ids = { + // Forward tasks + task_id_t::BATCHMATMUL_FWD_TASK_ID, + task_id_t::BATCHNORM_FWD_TASK_ID, + task_id_t::BROADCAST_FWD_TASK_ID, + task_id_t::CAST_FWD_TASK_ID, + task_id_t::COMBINE_FWD_TASK_ID, + task_id_t::CONCAT_FWD_TASK_ID, + task_id_t::CONV2D_FWD_TASK_ID, + task_id_t::DROPOUT_FWD_TASK_ID, + task_id_t::ELEMENTBINARY_FWD_TASK_ID, + task_id_t::ELEMENTUNARY_FWD_TASK_ID, + task_id_t::EMBED_FWD_TASK_ID, + task_id_t::FLAT_FWD_TASK_ID, + task_id_t::GATHER_FWD_TASK_ID, + task_id_t::LAYERNORM_FWD_TASK_ID, + task_id_t::LINEAR_FWD_TASK_ID, + task_id_t::ATTENTION_FWD_TASK_ID, + task_id_t::POOL2D_FWD_TASK_ID, + task_id_t::REDUCE_FWD_TASK_ID, + task_id_t::REDUCTION_FWD_TASK_ID, + task_id_t::REPARTITION_FWD_TASK_ID, + task_id_t::REPLICATE_FWD_TASK_ID, + task_id_t::RESHAPE_FWD_TASK_ID, + task_id_t::REVERSE_FWD_TASK_ID, + task_id_t::SOFTMAX_FWD_TASK_ID, + task_id_t::SPLIT_FWD_TASK_ID, + task_id_t::TOPK_FWD_TASK_ID, + task_id_t::TRANSPOSE_FWD_TASK_ID, + + // Backward tasks + task_id_t::BATCHMATMUL_BWD_TASK_ID, + task_id_t::BATCHNORM_BWD_TASK_ID, + 
task_id_t::BROADCAST_BWD_TASK_ID, + task_id_t::CAST_BWD_TASK_ID, + task_id_t::COMBINE_BWD_TASK_ID, + task_id_t::CONCAT_BWD_TASK_ID, + task_id_t::CONV2D_BWD_TASK_ID, + task_id_t::DROPOUT_BWD_TASK_ID, + task_id_t::ELEMENTBINARY_BWD_TASK_ID, + task_id_t::ELEMENTUNARY_BWD_TASK_ID, + task_id_t::EMBED_BWD_TASK_ID, + task_id_t::FLAT_BWD_TASK_ID, + task_id_t::GATHER_BWD_TASK_ID, + task_id_t::LAYERNORM_BWD_TASK_ID, + task_id_t::LINEAR_BWD_TASK_ID, + task_id_t::ATTENTION_BWD_TASK_ID, + task_id_t::POOL2D_BWD_TASK_ID, + task_id_t::REDUCE_BWD_TASK_ID, + task_id_t::REDUCTION_BWD_TASK_ID, + task_id_t::REPARTITION_BWD_TASK_ID, + task_id_t::REPLICATE_BWD_TASK_ID, + task_id_t::RESHAPE_BWD_TASK_ID, + task_id_t::REVERSE_BWD_TASK_ID, + task_id_t::SOFTMAX_BWD_TASK_ID, + task_id_t::SPLIT_BWD_TASK_ID, + task_id_t::TOPK_BWD_TASK_ID, + task_id_t::TRANSPOSE_BWD_TASK_ID, + + // Update tasks + task_id_t::SGD_UPD_NCCL_TASK_ID, + task_id_t::ADAM_UPD_NCCL_TASK_ID, + + // Loss task + task_id_t::LOSS_BWD_TASK_ID, + }; + + for (task_id_t task_id : task_ids) { + pending_registrations.push_back( + register_task(Realm::Processor::LOC_PROC, task_id, op_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::TOC_PROC, task_id, op_task_body)); + } + + pending_registrations.push_back(register_task(Realm::Processor::LOC_PROC, + task_id_t::CONTROLLER_TASK_ID, + controller_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::LOC_PROC, + task_id_t::DEVICE_HANDLE_INIT_TASK_ID, + ff_handle_init_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::TOC_PROC, + task_id_t::DEVICE_HANDLE_INIT_TASK_ID, + ff_handle_init_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::LOC_PROC, + task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID, + ff_handle_init_return_task_body)); + pending_registrations.push_back( + register_task(Realm::Processor::LOC_PROC, + task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID, + 
per_device_op_state_init_return_task_body)); + return Realm::Event::merge_events(pending_registrations); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc new file mode 100644 index 0000000000..806059f3ed --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc @@ -0,0 +1,14 @@ +#include "realm-execution/tasks/serializer/serializable_realm_event.h" + +namespace FlexFlow { + +SerializableRealmEvent realm_event_to_serializable(Realm::Event const &event) { + return SerializableRealmEvent{event.id}; +} + +Realm::Event + realm_event_from_serializable(SerializableRealmEvent const &event) { + return Realm::Event{event.id}; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc new file mode 100644 index 0000000000..0e58d6e36c --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc @@ -0,0 +1,23 @@ +#include "realm-execution/tasks/serializer/serializable_realm_instance.h" +#include "utils/exception.h" +#include + +namespace FlexFlow { + +// Realm::RegionInstance is trivially copyable so it's safe to treat it as bytes +static_assert(std::is_trivially_copy_constructible_v); + +SerializableRealmInstance + realm_instance_to_serializable(Realm::RegionInstance const &inst) { + uint8_t const *data = reinterpret_cast(&inst); + return SerializableRealmInstance{ + std::vector{data, data + sizeof(inst)}}; +} + +Realm::RegionInstance + realm_instance_from_serializable(SerializableRealmInstance const &inst) { + ASSERT(inst.instance.size() == sizeof(Realm::RegionInstance)); + return *reinterpret_cast(inst.instance.data()); +} + +} // namespace FlexFlow diff --git 
a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc new file mode 100644 index 0000000000..b16e2891c4 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc @@ -0,0 +1,15 @@ +#include "realm-execution/tasks/serializer/serializable_realm_processor.h" + +namespace FlexFlow { + +SerializableRealmProcessor + realm_processor_to_serializable(Realm::Processor const &proc) { + return SerializableRealmProcessor{proc.id}; +} + +Realm::Processor + realm_processor_from_serializable(SerializableRealmProcessor const &proc) { + return Realm::Processor{proc.id}; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc new file mode 100644 index 0000000000..79a5176c4f --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc @@ -0,0 +1,32 @@ +#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h" +#include "realm-execution/tasks/serializer/serializable_realm_event.h" +#include "realm-execution/tasks/serializer/serializable_realm_instance.h" +#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h" +#include "utils/containers/map_keys_and_values.h" + +namespace FlexFlow { + +SerializableTensorInstanceBacking tensor_instance_backing_to_serializable( + TensorInstanceBacking const &backing) { + return SerializableTensorInstanceBacking{/*backing=*/map_keys_and_values( + backing.backing, + dynamic_value_attrs_to_serializable, + [](std::pair const &p) { + return std::pair{realm_instance_to_serializable(p.first), + realm_event_to_serializable(p.second)}; + })}; +} + +TensorInstanceBacking tensor_instance_backing_from_serializable( + 
SerializableTensorInstanceBacking const &backing) { + return TensorInstanceBacking{/*backing=*/map_keys_and_values( + backing.backing, + dynamic_value_attrs_from_serializable, + [](std::pair const + &p) { + return std::pair{realm_instance_from_serializable(p.first), + realm_event_from_serializable(p.second)}; + })}; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc new file mode 100644 index 0000000000..dd4b0a66ca --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc @@ -0,0 +1,198 @@ +#include "realm-execution/tasks/task_id_t.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/optimizers/adam_optimizer_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h" +#include "utils/optional.h" +#include "utils/overload.h" + +namespace FlexFlow { + +std::optional + get_task_id_for_op(DynamicNodeAttrs const &node_attrs, + std::optional const &optimizer_attrs) { + DynamicTaskType task_type = assert_unwrap(node_attrs.task_type); + switch (task_type) { + case DynamicTaskType::FWD: + return get_fwd_task_id_for_op_attrs( + assert_unwrap(node_attrs.op_attrs).require_pcg_op()); + case DynamicTaskType::BWD: + return get_bwd_task_id_for_op_attrs( + assert_unwrap(node_attrs.op_attrs).require_pcg_op()); + case DynamicTaskType::UPD: + return get_update_task_id_for_optimizer_attrs( + assert_unwrap(optimizer_attrs)); + case DynamicTaskType::LOSS: + return task_id_t::LOSS_BWD_TASK_ID; + default: + PANIC("Unhandled DynamicTaskType", task_type); + } +} + +std::optional + get_init_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return std::nullopt; }, + [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_INIT_TASK_ID; }, + [](BroadcastAttrs const &) { return std::nullopt; }, + [](CastAttrs const &) { return std::nullopt; }, + [](CombineAttrs const 
&attrs) { return task_id_t::COMBINE_INIT_TASK_ID; }, + [](ConcatAttrs const &) { return std::nullopt; }, + [](Conv2DAttrs const &) { return task_id_t::CONV2D_INIT_TASK_ID; }, + [](DropoutAttrs const &) { return task_id_t::DROPOUT_INIT_TASK_ID; }, + [](ElementBinaryAttrs const &) { + return task_id_t::ELEMENTBINARY_INIT_TASK_ID; + }, + [](ElementUnaryAttrs const &) { + return task_id_t::ELEMENTUNARY_INIT_TASK_ID; + }, + [](EmbeddingAttrs const &) { return std::nullopt; }, + [](FlatAttrs const &) { return std::nullopt; }, + [](GatherAttrs const &) { return task_id_t::GATHER_INIT_TASK_ID; }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_INIT_TASK_ID; }, + [](LinearAttrs const &) { return task_id_t::LINEAR_INIT_TASK_ID; }, + [](MultiHeadAttentionAttrs const &) { + return task_id_t::ATTENTION_INIT_TASK_ID; + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return task_id_t::POOL2D_INIT_TASK_ID; }, + [](ReduceAttrs const &) { return task_id_t::REDUCE_INIT_TASK_ID; }, + [](ReductionAttrs const &attrs) { + return task_id_t::REDUCTION_INIT_TASK_ID; + }, + [](RepartitionAttrs const &attrs) { + return task_id_t::REPARTITION_INIT_TASK_ID; + }, + [](ReplicateAttrs const &attrs) { + return task_id_t::REPLICATE_INIT_TASK_ID; + }, + [](ReshapeAttrs const &) { return std::nullopt; }, + [](ReverseAttrs const &) { return std::nullopt; }, + [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_INIT_TASK_ID; }, + [](SplitAttrs const &) { return std::nullopt; }, + [](TopKAttrs const &) { return std::nullopt; }, + [](TransposeAttrs const &) { return std::nullopt; }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} + +std::optional + get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { + return task_id_t::BATCHMATMUL_FWD_TASK_ID; + }, + [](BatchNormAttrs const &) { return 
task_id_t::BATCHNORM_FWD_TASK_ID; }, + [](BroadcastAttrs const &) { return task_id_t::BROADCAST_FWD_TASK_ID; }, + [](CastAttrs const &) { return task_id_t::CAST_FWD_TASK_ID; }, + [](CombineAttrs const &attrs) { return task_id_t::COMBINE_FWD_TASK_ID; }, + [](ConcatAttrs const &) { return task_id_t::CONCAT_FWD_TASK_ID; }, + [](Conv2DAttrs const &) { return task_id_t::CONV2D_FWD_TASK_ID; }, + [](DropoutAttrs const &) { return task_id_t::DROPOUT_FWD_TASK_ID; }, + [](ElementBinaryAttrs const &) { + return task_id_t::ELEMENTBINARY_FWD_TASK_ID; + }, + [](ElementUnaryAttrs const &) { + return task_id_t::ELEMENTUNARY_FWD_TASK_ID; + }, + [](EmbeddingAttrs const &) { return task_id_t::EMBED_FWD_TASK_ID; }, + [](FlatAttrs const &) { return task_id_t::FLAT_FWD_TASK_ID; }, + [](GatherAttrs const &) { return task_id_t::GATHER_FWD_TASK_ID; }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_FWD_TASK_ID; }, + [](LinearAttrs const &) { return task_id_t::LINEAR_FWD_TASK_ID; }, + [](MultiHeadAttentionAttrs const &) { + return task_id_t::ATTENTION_FWD_TASK_ID; + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return task_id_t::POOL2D_FWD_TASK_ID; }, + [](ReduceAttrs const &) { return task_id_t::REDUCE_FWD_TASK_ID; }, + [](ReductionAttrs const &attrs) { + return task_id_t::REDUCTION_FWD_TASK_ID; + }, + [](RepartitionAttrs const &attrs) { + return task_id_t::REPARTITION_FWD_TASK_ID; + }, + [](ReplicateAttrs const &attrs) { + return task_id_t::REPLICATE_FWD_TASK_ID; + }, + [](ReshapeAttrs const &) { return task_id_t::RESHAPE_FWD_TASK_ID; }, + [](ReverseAttrs const &) { return task_id_t::REVERSE_FWD_TASK_ID; }, + [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_FWD_TASK_ID; }, + [](SplitAttrs const &) { return task_id_t::SPLIT_FWD_TASK_ID; }, + [](TopKAttrs const &) { return task_id_t::TOPK_FWD_TASK_ID; }, + [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_FWD_TASK_ID; }, + 
[](WeightAttrs const &) { return std::nullopt; }, + }); +} + +std::optional + get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { + return task_id_t::BATCHMATMUL_BWD_TASK_ID; + }, + [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_BWD_TASK_ID; }, + [](BroadcastAttrs const &) { return task_id_t::BROADCAST_BWD_TASK_ID; }, + [](CastAttrs const &) { return task_id_t::CAST_BWD_TASK_ID; }, + [](CombineAttrs const &attrs) { return task_id_t::COMBINE_BWD_TASK_ID; }, + [](ConcatAttrs const &) { return task_id_t::CONCAT_BWD_TASK_ID; }, + [](Conv2DAttrs const &) { return task_id_t::CONV2D_BWD_TASK_ID; }, + [](DropoutAttrs const &) { return task_id_t::DROPOUT_BWD_TASK_ID; }, + [](ElementBinaryAttrs const &) { + return task_id_t::ELEMENTBINARY_BWD_TASK_ID; + }, + [](ElementUnaryAttrs const &) { + return task_id_t::ELEMENTUNARY_BWD_TASK_ID; + }, + [](EmbeddingAttrs const &) { return task_id_t::EMBED_BWD_TASK_ID; }, + [](FlatAttrs const &) { return task_id_t::FLAT_BWD_TASK_ID; }, + [](GatherAttrs const &) { return task_id_t::GATHER_BWD_TASK_ID; }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_BWD_TASK_ID; }, + [](LinearAttrs const &) { return task_id_t::LINEAR_BWD_TASK_ID; }, + [](MultiHeadAttentionAttrs const &) { + return task_id_t::ATTENTION_BWD_TASK_ID; + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return task_id_t::POOL2D_BWD_TASK_ID; }, + [](ReduceAttrs const &) { return task_id_t::REDUCE_BWD_TASK_ID; }, + [](ReductionAttrs const &attrs) { + return task_id_t::REDUCTION_BWD_TASK_ID; + }, + [](RepartitionAttrs const &attrs) { + return task_id_t::REPARTITION_BWD_TASK_ID; + }, + [](ReplicateAttrs const &attrs) { + return task_id_t::REPLICATE_BWD_TASK_ID; + }, + [](ReshapeAttrs const &) { return task_id_t::RESHAPE_BWD_TASK_ID; }, + [](ReverseAttrs const &) { return 
task_id_t::REVERSE_BWD_TASK_ID; }, + [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_BWD_TASK_ID; }, + [](SplitAttrs const &) { return task_id_t::SPLIT_BWD_TASK_ID; }, + [](TopKAttrs const &) { return task_id_t::TOPK_BWD_TASK_ID; }, + [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_BWD_TASK_ID; }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} + +std::optional get_update_task_id_for_optimizer_attrs( + OptimizerAttrs const &optimizer_attrs) { + + return optimizer_attrs.visit>(overload{ + [](SGDOptimizerAttrs const &) { return task_id_t::SGD_UPD_NCCL_TASK_ID; }, + [](AdamOptimizerAttrs const &) { + return task_id_t::ADAM_UPD_NCCL_TASK_ID; + }, + }); +} + +Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id) { + return Realm::Processor::TASK_ID_FIRST_AVAILABLE + + static_cast(task_id); +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc new file mode 100644 index 0000000000..dea51d8c92 --- /dev/null +++ b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc @@ -0,0 +1,25 @@ +#include "realm-execution/tensor_instance_backing.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +TensorInstanceBacking make_empty_tensor_instance_backing() { + return TensorInstanceBacking{ + /*backing=*/{}, + }; +} + +TensorInstanceBacking subset_tensor_instance_backing_for_invocation( + TensorInstanceBacking const &backing, + DynamicNodeInvocation const &invocation) { + TensorInstanceBacking result = make_empty_tensor_instance_backing(); + for (DynamicValueAttrs const &value : values(invocation.inputs)) { + result.backing.insert(std::pair{value, backing.backing.at(value)}); + } + for (DynamicValueAttrs const &value : values(invocation.outputs)) { + result.backing.insert(std::pair{value, backing.backing.at(value)}); + } + return result; +} + +} // namespace FlexFlow diff --git 
a/lib/realm-execution/test/CMakeLists.txt b/lib/realm-execution/test/CMakeLists.txt new file mode 100644 index 0000000000..b3beff42c0 --- /dev/null +++ b/lib/realm-execution/test/CMakeLists.txt @@ -0,0 +1,15 @@ +ff_add_test_executable( + NAME + realm-execution-tests + SRC_PATTERNS + src/*.cc + PRIVATE_INCLUDE + src/ + DEPS + doctest + utils-test-common + realm-execution + kernels + op-attrs + task-spec +) diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.cc b/lib/realm-execution/test/src/internal/realm_test_utils.cc new file mode 100644 index 0000000000..e381feb8de --- /dev/null +++ b/lib/realm-execution/test/src/internal/realm_test_utils.cc @@ -0,0 +1,28 @@ +#include "internal/realm_test_utils.h" +#include +#include + +namespace FlexFlow { + +static char *leak_string_contents(std::string const &str) { + // Realm command-line arguments require char* so intentionally leak the + // allocated string contents here + std::vector *content = new std::vector{str.begin(), str.end()}; + content->push_back(0); // NUL byte + return content->data(); +} + +std::vector make_fake_realm_args(positive_int num_cpus, + nonnegative_int num_gpus) { + std::vector result; + result.push_back(leak_string_contents("fake_executable_name")); + result.push_back(leak_string_contents("-ll:cpu")); + result.push_back(leak_string_contents(fmt::to_string(num_cpus))); + if (num_gpus > 0) { + result.push_back(leak_string_contents("-ll:gpu")); + result.push_back(leak_string_contents(fmt::to_string(num_gpus))); + } + return result; +} + +} // namespace FlexFlow diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.h b/lib/realm-execution/test/src/internal/realm_test_utils.h new file mode 100644 index 0000000000..8e2775ad8b --- /dev/null +++ b/lib/realm-execution/test/src/internal/realm_test_utils.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H +#define _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H + 
+#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" +#include + +namespace FlexFlow { + +std::vector make_fake_realm_args(positive_int num_cpus, + nonnegative_int num_gpus); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc new file mode 100644 index 0000000000..8ce5d3ed6e --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/distributed_ff_handle.cc @@ -0,0 +1,70 @@ +#include "realm-execution/distributed_ff_handle.h" +#include "internal/realm_test_utils.h" +#include "realm-execution/realm_manager.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("DistributedFfHandle") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager(&fake_argc, &fake_argv); + + (void)manager.start_controller([](RealmContext &ctx) { + DistributedFfHandle handle = create_distributed_ff_handle( + /*ctx=*/ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + // Make sure we have handles for the processors we're expecting + Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine()); + cpus.only_kind(Realm::Processor::LOC_PROC); + CHECK(cpus.count() == 2); + for (Realm::Processor proc : cpus) { + handle.at(proc); + } + }); + } +} + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("DistributedFfHandle (GPU)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/1_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager(&fake_argc, &fake_argv); + + (void)manager.start_controller([](RealmContext &ctx) { + DistributedFfHandle handle = create_distributed_ff_handle( + 
/*ctx=*/ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + // Make sure we have handles for the processors we're expecting + Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine()); + cpus.only_kind(Realm::Processor::LOC_PROC); + CHECK(cpus.count() == 2); + for (Realm::Processor proc : cpus) { + handle.at(proc); + } + + Realm::Machine::ProcessorQuery gpus(Realm::Machine::get_machine()); + gpus.only_kind(Realm::Processor::TOC_PROC); + CHECK(gpus.count() == 1); + for (Realm::Processor proc : gpus) { + handle.at(proc); + } + }); + } +} + +} // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc new file mode 100644 index 0000000000..4063ec32f2 --- /dev/null +++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc @@ -0,0 +1,33 @@ +#include "realm-execution/realm_manager.h" +#include "internal/realm_test_utils.h" +#include "realm-execution/distributed_ff_handle.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("RealmManager") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + // Initialize Realm + RealmManager manager(&fake_argc, &fake_argv); + + // Launch a controller + int some_data = 123; + Realm::Event event = manager.start_controller([&](RealmContext &ctx) { + // Data is captured and retains value + ASSERT(some_data == 123); + }); + // Need to block on the completion of the event to ensure we don't race, + // because the lambda captures the environment + event.wait(); + } +} + +} // namespace test diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc new file mode 100644 index 0000000000..8e5918b0f0 --- /dev/null +++ 
b/lib/realm-execution/test/src/realm-execution/test_e2e.cc @@ -0,0 +1,492 @@ +#include "internal/realm_test_utils.h" +#include "kernels/allocation.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_slot_name.dtg.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "realm-execution/distributed_ff_handle.h" +#include "realm-execution/dynamic_tensor_accessor_from_instance.h" +#include "realm-execution/pcg_instance.h" +#include "realm-execution/realm_context.h" +#include "realm-execution/realm_manager.h" +#include "task-spec/permissions.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/require_only_key.h" +#include + +namespace test { + +using namespace ::FlexFlow; +namespace Realm = ::FlexFlow::Realm; + +static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch, + Allocator &allocator) { + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, allocator)); +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training (CPU Model Parallelism)") { + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager = RealmManager{&fake_argc, &fake_argv}; + + (void)manager.start_controller([](RealmContext &ctx) { + Allocator allocator = 
ctx.get_current_device_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + + // construct computation graph + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT}; + + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer_with_grad(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult weights_layer_1 = add_parallel_layer( + pcg, + ParallelLayerAttrs{ + PCGOperatorAttrs{WeightAttrs{ + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + parallel_tensor_guid_t t_weights_1 = + require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult weights_layer_2 = add_parallel_layer( + pcg, + ParallelLayerAttrs{ + PCGOperatorAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + parallel_tensor_guid_t t_weights_2 = + require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult linear_operator_1 = add_parallel_layer( + pcg, + 
ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + { + { + TensorSlotName::INPUT, + t_input, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_1, + }, + }); + parallel_tensor_guid_t t_linear_1 = + require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult linear_operator_2 = add_parallel_layer( + pcg, + ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + { + { + TensorSlotName::INPUT, + t_linear_1, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_2, + }, + }); + parallel_tensor_guid_t t_linear_2 = + require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT); + + MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU}; + MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU}; + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + MappedParallelComputationGraph mpcg{ + pcg, + { + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {weights_layer_1.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {weights_layer_2.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu1, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {linear_operator_1.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::WEIGHT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + {linear_operator_2.parallel_layer, + MappedOperatorTaskGroup{ + {{cpu1, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::WEIGHT, tensor_coord0}, + 
{TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + }, + }; + MappedOperatorTaskGroup loss_mapping{ + {{cpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; + + // instantiate computation graph + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = + create_distributed_ff_handle(ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = create_pcg_instance( + /*ctx=*/ctx, + /*mpcg=*/mpcg, + /*optimizer=*/optimizer_attrs, + /*loss=*/loss_attrs, + /*label_tensor=*/label_tensor, + /*logit_tensor=*/t_linear_2, + /*loss_mapping=*/loss_mapping, + /*input_tensors=*/input_tensors, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + + // begin training loop + int num_epochs = 5; + std::vector loss_values; + + for (int i = 0; i < num_epochs; i++) { + perform_all_passes_for_pcg_instance( + /*instance=*/pcg_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + loss_values.push_back(copy_tensor_accessor_r( + dynamic_tensor_accessor_from_instance( + pcg_instance.get_loss_tensor_instance().value(), + Realm::Event::NO_EVENT, + lift_to_parallel( + TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}}, + DataType::FLOAT}), + Permissions::RO, + ctx.get_current_processor()) + .require_read(), + allocator)); + } + + // Assert that each sample in the batch has a lower loss in last epoch + // than the first epoch + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + GenericTensorAccessorR 
last_epoch_loss = loss_values.back(); + CHECK_MESSAGE( + did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator), + check_kv("first_epoch_loss", + format_accessor_r_contents(first_epoch_loss)), + check_kv("last_epoch_loss", + format_accessor_r_contents(last_epoch_loss))); + }); + } +} + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("RealmBackend e2e Training (GPU Model Parallelism)") { + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + // construct computation graph + ParallelComputationGraph pcg = empty_parallel_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT}; + + ParallelLayerAddedResult inputs_layer = + pcg_add_input_layer_with_grad(pcg, input_tensor_shape); + parallel_tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult weights_layer_1 = add_parallel_layer( + pcg, + ParallelLayerAttrs{ + PCGOperatorAttrs{WeightAttrs{ + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + parallel_tensor_guid_t t_weights_1 = + require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult weights_layer_2 = add_parallel_layer( + pcg, + ParallelLayerAttrs{ + PCGOperatorAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + parallel_tensor_guid_t t_weights_2 = + 
require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult linear_operator_1 = add_parallel_layer( + pcg, + ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + { + { + TensorSlotName::INPUT, + t_input, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_1, + }, + }); + parallel_tensor_guid_t t_linear_1 = + require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT); + + ParallelLayerAddedResult linear_operator_2 = add_parallel_layer( + pcg, + ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + { + { + TensorSlotName::INPUT, + t_linear_1, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_2, + }, + }); + parallel_tensor_guid_t t_linear_2 = + require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT); + + MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU}; + ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}}; + MappedParallelComputationGraph mpcg{ + pcg, + { + {inputs_layer.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {weights_layer_1.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {weights_layer_2.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{ + {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}}, + {linear_operator_1.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::WEIGHT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + {linear_operator_2.parallel_layer, + MappedOperatorTaskGroup{ + {{gpu0, + OperatorAtomicTaskShardBinding{{ + 
{TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::WEIGHT, tensor_coord0}, + {TensorSlotName::OUTPUT, tensor_coord0}, + }}}}}}, + }, + }; + MappedOperatorTaskGroup loss_mapping{ + {{gpu0, + OperatorAtomicTaskShardBinding{{ + {TensorSlotName::INPUT, tensor_coord0}, + {TensorSlotName::LOGIT, tensor_coord0}, + }}}}}; + + // instantiate computation graph + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + //! [realm-execution example] + std::vector fake_args = + make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/1_n); + int fake_argc = fake_args.size(); + char **fake_argv = fake_args.data(); + + RealmManager manager(&fake_argc, &fake_argv); + + Realm::Event e = manager.start_controller([&](RealmContext &ctx) { + Allocator allocator = ctx.get_current_device_allocator(); + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + + std::unordered_map + input_tensors; + + DistributedFfHandle device_handle = + create_distributed_ff_handle(ctx, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + PCGInstance pcg_instance = create_pcg_instance( + /*ctx=*/ctx, + /*mpcg=*/mpcg, + /*optimizer=*/optimizer_attrs, + /*loss=*/loss_attrs, + /*label_tensor=*/label_tensor, + /*logit_tensor=*/t_linear_2, + /*loss_mapping=*/loss_mapping, + /*input_tensors=*/input_tensors, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + + // begin training loop + int num_epochs = 5; + std::vector loss_values; + + for (int i = 0; i < num_epochs; i++) { + perform_all_passes_for_pcg_instance( + /*instance=*/pcg_instance, + 
/*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/device_handle, + /*iteration_config=*/FFIterationConfig{1_p}); + loss_values.push_back(copy_tensor_accessor_r( + dynamic_tensor_accessor_from_instance( + pcg_instance.get_loss_tensor_instance().value(), + Realm::Event::NO_EVENT, + lift_to_parallel( + TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}}, + DataType::FLOAT}), + Permissions::RO, + ctx.get_current_processor()) + .require_read(), + allocator)); + } + + // Assert that each sample in the batch has a lower loss in last epoch + // than the first epoch + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + GenericTensorAccessorR last_epoch_loss = loss_values.back(); + CHECK_MESSAGE( + did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator), + check_kv("first_epoch_loss", + format_accessor_r_contents(first_epoch_loss)), + check_kv("last_epoch_loss", + format_accessor_r_contents(last_epoch_loss))); + }); + + e.wait(); + //! [realm-execution example] + } +} + +} // namespace test diff --git a/lib/runtime/src/machine_model.cc b/lib/runtime/src/machine_model.cc index 73060e9b1a..ba16da9561 100644 --- a/lib/runtime/src/machine_model.cc +++ b/lib/runtime/src/machine_model.cc @@ -7,7 +7,7 @@ namespace FlexFlow { /// @param[in] nb_elements : size of your for loop -/// @param[in] functor(start, end) : +/// @param[in] functor : /// your function processing a sub chunk of the for loop. /// "start" is the first index to process (included) until the index "end" /// (excluded) diff --git a/lib/runtime/src/task_spec/README.md b/lib/runtime/src/task_spec/README.md deleted file mode 100644 index 0884e62b4d..0000000000 --- a/lib/runtime/src/task_spec/README.md +++ /dev/null @@ -1,176 +0,0 @@ -# task\_spec - -The `task_spec` interface provides an easy-to-use, high-level, and safe abstraction on top of Legion tasks. 
-While not all Legion features are supported, the `task_spec` interface is capable of expressing all Legion usages in FlexFlow. -Using `task_spec` is not mandatory (Legion still works fine, as everything simply compiles down to Legion `TaskLauncher`, etc. -anyway), but any code that can use `task_spec` is strongly advised to use it as it is significantly less verbose, safer, and -prevents common errors. - -The `task_spec` code consists of two parts: `TaskSignature` ([task\_signature.h](./task_signature.h)) and `TaskInvocation` ([task\_invocation.h](./task_invocation.h)), -which can be intuitively understood as function signatures and function calls in a typical programming language. -`TaskSignature`s define a set of _slots_ of two kinds: -each can be either a _tensor slot_, which represents a parallel tensor whose Legion region will be passed to the underlying task, -or an _argument slot_, which can be used to pass small[^1] values of arbitrary[^2] type via `Legion::TaskArgument`. - -As with function signatures/calls, each task has a single `TaskSignature` but can have multiple `TaskInvocation`s. -`TaskSignature`s are registered for `task_id_t`s via the `register_task` function, which is usually called by specializations of `template register_task` -defined in the relevant file (e.g., [optimizer.h](../optimizer.h) and [optimizer.cc](../optimizer.cc)), which are ultimately called by -`register_flexflow_internal_tasks` in [tasks.cc](../tasks.cc). - -To execute a pair of a `TaskSignature` and a `TaskInvocation`, they must be compiled/translated/lowered to a call to a `Legion::TaskLauncher` or a -`Legion::IndexTaskLauncher`. -Ideally this would simply be done in a single step, but in practice the ability to specify `TaskInvocation`s at different layers of abstraction can -be very useful. 
-Thus, what we previously referred to as `TaskInvocation` is actually logically the following set of classes: - -```mermaid -flowchart TD - A[OpTaskInvocation] - B[TaskInvocation] - C[ExecutableTaskInvocation] - D[TensorlessTaskInvocation] - E[IndexTaskInvocation] - F[Legion::TaskLauncher] - G[Legion::IndexTaskLauncher] - H[ExecutableIndexTaskInvocation] - I[TensorlessIndexTaskInvocation] - A -->|compiles down to| E - E -->|compiles down to| H - H -->|compiles down to| I - I -->|compiles down to| G - - B -->|compiles down to| C - C -->|compiles down to| D - D -->|compiles down to| F -``` -Similarly, `TaskSignature` is actually divided up into `OpTaskSignature` and `TaskSignature`. -The flow of full compilation process is as follows: -```mermaid -%%{init: { 'themeVariables': { 'fontFamily': 'monospace' }, 'flowchart': { 'curve': 'bumpY', 'defaultRenderer': 'elk' }, 'theme': 'default' } }%% -flowchart TD - A[OpTaskInvocation] - B[TaskInvocation] - C[ExecutableTaskInvocation] - D[TensorlessTaskInvocation] - E[IndexTaskInvocation] - F[Legion::TaskLauncher] - G[Legion::IndexTaskLauncher] - H[ExecutableIndexTaskInvocation] - I[TensorlessIndexTaskInvocation] - J[OpTaskSignature] - K[TaskSignature] - L[ConcreteArgsFormat] - M[FutureArgsFormat] - N[TensorArgsFormat] - O[IndexArgsFormat] - P[TaskArgumentsFormat] - Q[Legion::TaskArgument] - R[Legion::ArgumentMap] - S[TaskReturnAccessor] - T[IndexTaskReturnAccessor] - AA[task_id_t] - AC[TensorlessTaskBinding] - AD[TensorlessIndexTaskBinding] - AE[task_impl function] - AF[task function] - AG[Legion::Task] - AH["std::vector"] - AI[Legion::Context] - AJ[Legion::Runtime] - AK[TaskArgumentAccessor] - AL[add_region_requirement] - - A -->|compiles to| E - E -->|compiles to| H - H -->|compiles to| N - N -->|compiles to| P - N -->|invokes| AL - AL -->|on| G - H -->|compiles to| I - I -->|has member| AA - I -->|has member| AD - AD -->|compiles to| M - AD -->|compiles to| O - AD -->|compiles to| L - O -->|compiles to| R - O -->|compiles 
to| P - M -->|compiles to| P - L -->|compiles to| P - M -->|compiles to| Q - O -->|compiles to| Q - L -->|compiles to| Q - P -->|compiles to| Q - Q -->|passed to| G - R -->|passed to| G - G -->|generates a| AG - G -->|generates a| AH - G -->|generates a| AI - G -->|generates a| AJ - AG -->|passed to| AF - AH -->|passed to| AF - AI -->|passed to| AF - AJ -->|passed to| AF - AF -->|generates a| AK - AK -->|passed to| AE - AE -->|possibly generates a| S - G -->|possibly generates a| S - K -->|possibly generates a| S - - B -->|compiles to| C - C -->|compiles to| N - C -->|compiles to| D - D -->|has member| AA - D -->|has member| AC - AC -->|compiles to| L - AC -->|compiles to| M - L -->|compiles to| P - M -->|compiles to| P - L -->|compiles to| Q - M -->|compiles to| Q - P -->|compiles to| Q - Q -->|passed to| F - AL -->|on| F - F -->|generates a| AG - F -->|generates a| AH - F -->|generates a| AI - F -->|generates a| AJ - AE -->|possibly generates a| T - G -->|possibly generates a| T - K -->|possibly generates a| T - - J -->|compiles to| K -``` - -The primary difference between the different `TaskInvocation` types is which argument types they support. -The full list of argument types is: -- tensor slots - - `OpTensorSpec`: a reference to a input, output, or weight tensor attched to the given operator. - - `ParallelTensorSpec`: a reference (via `parallel_tensor_guid_t`) to a parallel tensor somewhere in the PCG. -- argument slots - - `OpArgRefSpec`: an argument that should be filled in during the compilation process from `OpTaskInvocation` to `TaskInvocation`. 
For those familiar with `Reader` monads, this is roughly analogous - - `ConcreteArgSpec`: a concrete value - - `IndexArgSpec`: a set of concrete values, each of which should be sent to a different Index Task - - `CheckedTypedFuture`: a legion future whose value should be passed into the task - - `CheckedTypedFutureMap`: a set of legion futures, each of which should have its value sent to a different Index Task (conceptually, `IndexArgSpec` + `CheckedTypedFuture`) - - `ArgRefSpec`: an argument that should be filled in during the compilation process from `TaskInvocation` to `ExecutableTaskInvocation`. For those familiar with `Reader` monads, this is roughly analogous - - `TaskInvocationSpec`: a nested task invocation which should be launched and have its resulting `Future` passed into the given task - - `IndexTaskInvocationSpec`: (currently not implemented, may or may not be necessary) - -The supported argument types for each invocation type are: -- `OpTaskInvocation` - - `OpTensorSpec`, `OpArgRefSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `ArgRefSpec`, `TaskInvocationSpec`, `IndexTaskInvocationSpec` -- `TaskInvocation` - - `ParallelTensorSpec`, `ConcreteArgSpec`, `CheckedTypedFuture`, `ArgRefSpec`, `TaskInvocationSpec` -- `IndexTaskInvocation` - - `ParallelTensorSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `ArgRefSpec`, `TaskInvocationSpec`, `IndexTaskInvocationSpec` -- `ExecutableTaskInvocation` - - `ParallelTensorSpec`, `ConcreteArgSpec`, `CheckedTypedFuture`, `TaskInvocationSpec` -- `ExecutableIndexTaskInvocation` - - `ParallelTensorSpec`, `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, `CheckedTypedFutureMap`, `TaskInvocationSpec`, `IndexTaskInvocationSpec` -- `TensorlessTaskInvocation` - - `ConcreteArgSpec`, `CheckedTypedFuture`, `TaskInvocationSpec` -- `TensorlessIndexTaskInvocation` - - `ConcreteArgSpec`, `IndexArgSpec`, `CheckedTypedFuture`, 
`CheckedTypedFutureMap`, `TaskInvocationSpec`, `IndexTaskInvocationSpec` - -[^1]: i.e., not tensor-sized -[^2]: Types must either be serializable ([serialization.h](../serialization.h)) or device-specific ([device\_specific\_arg.h](./device-specific-arg.h)) diff --git a/lib/substitutions/README.md b/lib/substitutions/README.md deleted file mode 100644 index e9db4c6aab..0000000000 --- a/lib/substitutions/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# substitutions - -## Substitution - -A substitution is to replace a subgraph of the PCG by a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph to replace the input graph as the output graph. - -A `Substitution` object describes a substitution. It consists of -* An `input_graph` of type `GraphPattern` that describes which kind of input graphs the substitution can be applied to; -* An `output_graph` of type `OutputGraphExpr` that describes how the output graph is computed from the input graph; and -* An `input_mapping` and `output_maping` that describes how the output graph is connected to the original PCG. - -### GraphPattern and MultiDiGraphPatternMatch - -A `GraphPattern` is defined as an open graph with node label `OperatorPattern` and output label `ParallelTensorPattern`, which is refered to as the pattern graph. The graph structure of a `GraphPattern` instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that. - -To apply a substitution to a PCG, we should first match the pattern graph to a subgraph of the PCG. `MultiDiGraphPatternMatch` describes the match, which consists of -* `node_assignment`: a mapping from the nodes of the pattern graph to the nodes of the PCG; and -* `edge_assignment`: a mapping from the edges of the pattern graph to the nodes of the PCG. -The input graph derived by this match is then defined by `values(node_assignment)` and `values(edge_assignment)`. 
A match is valid if and only if -* `node_assignment` and `edge_assignment` are injections; -* For every node `n` in the pattern graph, `edge_assignment` derives a bijection between `query_edges({n})` and `query_edges({node_assignment.at_l(n)})`. - -### OutputGraphExpr - -An `OutputGraphExpr` is defined as an open graph with node label `OperatorAttrAssignment` and output label `ParallelTensorAttrAssignment`, which defines how the operator attributes and the parallel tensor attributes of the output graph are derived from the input graph. - -`OperatorAttrAssignment` is a collection of `OperatorAttributeKey` and `GraphAttributeExpr` pairs. It defines how the attributes of a single operator is calculated from the input graph. A pair `{operator_attribute_key, graph_attribute_expr}` in the collection means the value of `graph_attribute_expr` is assigned to the attribute named `operator_attribute_key` of the operator. - -`ParallelTensorAttrAssignment` is defined in the similar way to `OperatorAttrAssignment`. - -`GraphAttributeExpr` is defined as one of `NodeAttrAccess`, `EdgeAttrAccess` and `AttrConstant`: -* `NodeAttrAccess` consists of a node `node` and an expression `attr_expr` on the attributes of the operator associated with the node. The value of a `NodeAttrAccess` instance is the value of `attr_expr` evaluated on the operator associated with the node. -* `EdgeAttrAccess` is defined in the similar way to `NodeAttrAccess`. -* `AttrConstant` consists of a constant `value`. The value of an `AttrConstant` instance is `value`. 
diff --git a/lib/substitutions/include/substitutions/substitution.dtg.toml b/lib/substitutions/include/substitutions/substitution.dtg.toml index 5daeaceded..bd98efc71b 100644 --- a/lib/substitutions/include/substitutions/substitution.dtg.toml +++ b/lib/substitutions/include/substitutions/substitution.dtg.toml @@ -15,15 +15,27 @@ includes = [ [[fields]] name = "pcg_pattern" type = "::FlexFlow::PCGPattern" +docstring = """ +Describes which kind of input graphs the substitution can be applied to +""" [[fields]] name = "output_graph_expr" type = "::FlexFlow::OutputGraphExpr" +docstring = """ +Describes how the output graph is computed from the input graph +""" [[fields]] name = "inputs_mapping" type = "::FlexFlow::bidict<::FlexFlow::PatternInput, ::FlexFlow::OutputGraphExprInput>" +docstring = """ +Describes how the values matched by the pattern's inputs are connected to the original ParallelComputationGraph +""" [[fields]] name = "outputs_mapping" type = "::FlexFlow::bidict<::FlexFlow::PatternNodeOutput, ::FlexFlow::OutputGraphExprNodeOutput>" +docstring = """ +Describes how the values matched by the pattern's outputs are connected to the original ParallelComputationGraph +""" diff --git a/lib/substitutions/index.dox b/lib/substitutions/index.dox new file mode 100644 index 0000000000..821a81a59e --- /dev/null +++ b/lib/substitutions/index.dox @@ -0,0 +1,30 @@ +namespace FlexFlow { +/** + +\page substitutions substitutions + +\brief Contains the definitions of PCG substitutions (i.e., Substitution), as well as the code for serializing them. + +\section substitution Substitution + +A \ref Substitution is to replace a subgraph of the PCG by a new one. We refer to the subgraph to be replaced as the input graph, and the new subgraph to replace the input graph as the output graph. 
+ +\section pattern-matches PCGPattern and MultiDiGraphPatternMatch + +A \ref PCGPattern is defined as an open graph with node label OperatorPattern and output label ParallelTensorPattern, which is referred to as the pattern graph. The graph structure of a GraphPattern instance defines the geometrical property of the input graph, while the node labels and output labels define the attribute property of that. + +To apply a substitution to a PCG, we should first match the pattern graph to a subgraph of the PCG. `MultiDiGraphPatternMatch` describes the match, which consists of +* `node_assignment`: a mapping from the nodes of the pattern graph to the nodes of the PCG; and +* `edge_assignment`: a mapping from the edges of the pattern graph to the nodes of the PCG. +The input graph derived by this match is then defined by `values(node_assignment)` and `values(edge_assignment)`. A match is valid if and only if +* `node_assignment` and `edge_assignment` are injections; +* For every node `n` in the pattern graph, `edge_assignment` derives a bijection between `query_edges({n})` and `query_edges({node_assignment.at_l(n)})`. + +\section output-graph-expr OutputGraphExpr + +An \ref OutputGraphExpr is defined as an open graph with node label \ref OutputOperatorAttrsAssignment and output label \c std::monostate. + +\ref OutputOperatorAttrsAssignment is a collection of \ref OperatorAttributeKey and \ref OutputOperatorAttributeExpr pairs. It defines how the attributes of a single operator are calculated from the input graph. A pair \c "{operator_attribute_key, output_operator_attribute_expr}" in the collection means the value of \c output_operator_attribute_expr is assigned to the attribute named \c operator_attribute_key of the operator. 
+ +*/ +} diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml index c6e6673f33..bd64f52567 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml @@ -5,6 +5,7 @@ features = [ "eq", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml index 75e9099104..c9171b928b 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml @@ -5,6 +5,7 @@ features = [ "eq", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/task-spec/include/task-spec/dynamic_graph/index.dox b/lib/task-spec/include/task-spec/dynamic_graph/index.dox new file mode 100644 index 0000000000..04ceaf4935 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/index.dox @@ -0,0 +1,17 @@ +namespace FlexFlow { +/** + +\page task-spec-dynamic-graph task-spec/dynamic_graph + +\brief Contains common code for inferring and making explicit information from a \ref MappedParallelComputationGraph or \ref ComputationGraph, lowering it into a \ref DynamicOpenDataflowGraph that can be executed by \ref realm-execution and/or \ref local-execution, respectively. 
+ +\section task-spec-lowering-passes Lowering Passes + +- \ref pass_expansion.h +- \ref shard_expansion.h +- \ref update_insertion.h +- \ref loss_insertion.h +- \ref machine_slicing.h + +*/ +} diff --git a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h index c7cef3f06f..b3b2a465f8 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h @@ -6,12 +6,15 @@ #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" #include "task-spec/dynamic_graph/loss_insertion_result.dtg.h" +#include namespace FlexFlow { -LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg, - LossAttrs const &loss_attrs, - dynamic_tensor_guid_t logit_tensor); +LossInsertionResult perform_loss_insertion( + DynamicOpenDataflowGraph const &dg, + LossAttrs const &loss_attrs, + dynamic_tensor_guid_t logit_tensor, + std::optional const &loss_mapping); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h new file mode 100644 index 0000000000..758a0c2813 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H + +#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" + +namespace FlexFlow { + +DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg( + MappedParallelComputationGraph const &); + +} 
// namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml new file mode 100644 index 0000000000..3c43e1d637 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml @@ -0,0 +1,43 @@ +namespace = "FlexFlow" +name = "SerializableDynamicNodeAttrs" +type = "struct" +features = [ + "eq", + "hash", + "fmt", + "json", +] + +includes = [ + "", + "task-spec/dynamic_graph/dynamic_task_type.dtg.h", + "pcg/machine_space_coordinate.dtg.h", + "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h", + "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h", + "task-spec/dynamic_graph/training_operation_attrs.dtg.h", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", +] + +[[fields]] +name = "task_type" +type = "std::optional<::FlexFlow::DynamicTaskType>" + +[[fields]] +name = "device_coord" +type = "std::optional<::FlexFlow::MachineSpaceCoordinate>" + +[[fields]] +name = "mapping" +type = "std::optional<::FlexFlow::MappedOperatorTaskGroup>" + +[[fields]] +name = "op_attrs" +type = "std::optional<::FlexFlow::TrainingOperationAttrs>" + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::dynamic_layer_guid_t" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h new file mode 100644 index 0000000000..7a274a1e7b --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H + +#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h" +#include 
"task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h" + +namespace FlexFlow { + +SerializableDynamicNodeAttrs + dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &); +DynamicNodeAttrs + dynamic_node_attrs_from_serializable(SerializableDynamicNodeAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml new file mode 100644 index 0000000000..01f4cc8876 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml @@ -0,0 +1,33 @@ +namespace = "FlexFlow" +name = "SerializableDynamicNodeInvocation" +type = "struct" +features = [ + "eq", + "fmt", + "hash", + "json", +] + +includes = [ + "", + "task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h", + "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h", + "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", +] + +[[fields]] +name = "inputs" +type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>" + +[[fields]] +name = "node_attrs" +type = "::FlexFlow::SerializableDynamicNodeAttrs" + +[[fields]] +name = "outputs" +type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h new file mode 100644 index 0000000000..2bcdb9a898 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H +#define 
_FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H + +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h" + +namespace FlexFlow { + +SerializableDynamicNodeInvocation + dynamic_node_invocation_to_serializable(DynamicNodeInvocation const &); +DynamicNodeInvocation dynamic_node_invocation_from_serializable( + SerializableDynamicNodeInvocation const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml new file mode 100644 index 0000000000..6209bfa247 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml @@ -0,0 +1,38 @@ +namespace = "FlexFlow" +name = "SerializableDynamicValueAttrs" +type = "struct" +features = [ + "eq", + "hash", + "fmt", + "json", +] + +includes = [ + "", + "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h", + "op-attrs/parallel_tensor_shape.dtg.h", + "op-attrs/parallel_tensor_space_coordinate.dtg.h", + "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", +] + +[[fields]] +name = "tensor_guid" +type = "::FlexFlow::dynamic_tensor_guid_t" + +[[fields]] +name = "parallel_tensor_shape" +type = "std::optional<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "shard_coord" +type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>" + +[[fields]] +name = "role" +type = "std::optional<::FlexFlow::DynamicTensorRole>" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h new file mode 100644 index 0000000000..6272265b7e --- /dev/null +++ 
b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H + +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h" + +namespace FlexFlow { + +SerializableDynamicValueAttrs + dynamic_value_attrs_to_serializable(DynamicValueAttrs const &); +DynamicValueAttrs dynamic_value_attrs_from_serializable( + SerializableDynamicValueAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml index 66c475b3a9..1051d8ac13 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml @@ -5,6 +5,7 @@ features = [ "eq", "hash", "fmt", + "json", ] includes = [ diff --git a/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h index fddad49ddf..17f59702b3 100644 --- a/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h @@ -23,11 +23,13 @@ std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x); } // namespace FlexFlow +///\cond namespace std { template <> struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> { size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const; }; } // namespace std +///\endcond #endif diff --git a/lib/task-spec/include/task-spec/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h index a4707a2f6f..c17fb62af5 100644 --- 
a/lib/task-spec/include/task-spec/generic_task_impl_function.h +++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h @@ -23,11 +23,13 @@ std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); } // namespace FlexFlow +///\cond namespace std { template <> struct hash<::FlexFlow::GenericTaskImplFunction> { size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; }; } // namespace std +///\endcond #endif diff --git a/lib/task-spec/include/task-spec/ops/impl/dropout.h b/lib/task-spec/include/task-spec/ops/impl/dropout.h index a7b382ce62..192f2f8244 100644 --- a/lib/task-spec/include/task-spec/ops/impl/dropout.h +++ b/lib/task-spec/include/task-spec/ops/impl/dropout.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_IMPL_DROPOUT_H #include "op-attrs/ops/dropout_attrs.dtg.h" -#include "task-spec/task_id_t.dtg.h" #include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/index.dox b/lib/task-spec/include/task-spec/ops/index.dox new file mode 100644 index 0000000000..6910063ecd --- /dev/null +++ b/lib/task-spec/include/task-spec/ops/index.dox @@ -0,0 +1,9 @@ +namespace FlexFlow { +/** + +\page task-spec-ops task-spec/ops + +\brief Contains the runtime-generic operator implementations, i.e., the adapter code between the runtimes and \ref kernels. 
+ +*/ +} diff --git a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml b/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml deleted file mode 100644 index 557da6cf4c..0000000000 --- a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml +++ /dev/null @@ -1,18 +0,0 @@ -namespace = "FlexFlow" -name = "op_task_id_t" -type = "enum" -features = [ - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[values]] -name = "INIT" - -[[values]] -name = "FWD" - -[[values]] -name = "BWD" diff --git a/lib/task-spec/include/task-spec/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h index 68d3f98ebf..8783f902e4 100644 --- a/lib/task-spec/include/task-spec/per_device_op_state.h +++ b/lib/task-spec/include/task-spec/per_device_op_state.h @@ -8,7 +8,7 @@ namespace FlexFlow { -PerDeviceOpState get_device_state_from_device_specific( +PerDeviceOpState get_per_device_op_state_from_device_specific( DeviceSpecificPerDeviceOpState const &, device_id_t device_idx); } diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/index.dox b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox new file mode 100644 index 0000000000..9c42a19188 --- /dev/null +++ b/lib/task-spec/include/task-spec/task_argument_accessor/index.dox @@ -0,0 +1,82 @@ +namespace FlexFlow { +/** + +\page task-argument-accessor TaskArgumentAccessor Interface + +\brief TaskArgumentAccessor provides an interface for operator implementations to access arguments while hiding the details of the underlying execution engine (i.e., \ref local-execution or \ref realm-execution). 
+ +\section Background and Motivation + +\ref TaskArgumentAccessor was originally designed when %FlexFlow was using %Legion, which required +tasks to have the following signature: + +\code +void example_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); +\endcode + +The task implementation would then proceed to access the necessary arguments/context through these four parameters. +However, this made the code difficult to test, as creating/mocking these input objects was difficult and often even nonsensical in, for example, non-distributed settings. +\ref TaskArgumentAccessor was designed to provide an intermediate layer, such that you could transform the above code into + +\code +struct LegionArgumentAccessor : public ITaskArgumentAccessor { ... }; + +void example_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) +{ + TaskArgumentAccessor accessor + = TaskArgumentAccessor::create( + task, regions, ctx, runtime); + + return example_task_impl(accessor); +} + +void example_task_impl(TaskArgumentAccessor const &accessor); +\endcode + +That way, if we wanted to also call this in a non-distributed context, rather than having to create or mock the %Legion arguments, we can just add an additional implementation of ITaskArgumentAccessor which just access the arguments locally, and then we can also execute the task in a local context, all while leaving the actual task implementation unchanged. + +\code +struct LegionArgumentAccessor : public ITaskArgumentAccessor { ... }; + +void example_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) +{ + TaskArgumentAccessor accessor + = TaskArgumentAccessor::create( + task, regions, ctx, runtime); + + return example_task_impl(accessor); +} + +struct LocalArgumentAccessor : public ITaskArgumentAccessor { ... 
}; + +void example_task(MyLocalArgs const &my_args) +{ + TaskArgumentAccessor accessor + = TaskArgumentAccessor::create(my_args); + + return example_task_impl(accessor); +} + +void example_task_impl(TaskArgumentAccessor const &accessor); +\endcode + +\section Current Design + +TaskArgumentAccessor is just a thin, ref-counted wrapper around the abstract ITaskArgumentAccessor interface. +Instances of ITaskArgumentAccessor provide access to the following arguments: + +- One of \ref PCGOperatorAttrs, \ref LossAttrs, or \ref OptimizerAttrs depending on whether this task is for an operator, an optimizer, or a loss function. +- Two pieces of device-specific state: \ref device_handle_t (aka \ref PerDeviceFFHandle) and \ref PerDeviceOpState. As both of these contain pointers and hold device-specific initialization state, in distributed execution their addresses (rather than their contents) are passed around, and they are only valid on the device they originated on. One \ref PerDeviceFFHandle should be created per device, while one \ref PerDeviceOpState should be created for every operator for every device it runs on. +- A few simple value types communicating runtime-wide settings: \ref ProfilingSettings, \ref DeviceType, and \ref FFIterationConfig. 
+ +*/ +} diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml deleted file mode 100644 index 50349d5773..0000000000 --- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml +++ /dev/null @@ -1,28 +0,0 @@ -namespace = "FlexFlow" -name = "task_id_with_noop_default_t" -type = "variant" -features = [ - "eq", - "ord", - "hash", - "fmt", - "rapidcheck", -] - -includes = [ - "task-spec/task_id_t.dtg.h", - "", -] - -src_includes = [ - "utils/rapidcheck/monostate.h", - "utils/fmt/monostate.h", -] - -[[values]] -type = "::FlexFlow::task_id_t" -key = "real_task" - -[[values]] -type = "std::monostate" -key = "noop_task" diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h deleted file mode 100644 index 054b73844e..0000000000 --- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H -#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H - -#include "op-attrs/computation_graph_op_attrs.dtg.h" -#include "op-attrs/operator_type.dtg.h" -#include "task-spec/ops/op_task_id_t.dtg.h" -#include "task-spec/task_id_with_noop_default_t.dtg.h" - -namespace FlexFlow { - -task_id_with_noop_default_t lift_task_id_t(task_id_t); -task_id_with_noop_default_t default_noop_task(); - -task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t( - op_task_id_t, ComputationGraphOpAttrs const &); - -task_id_with_noop_default_t - get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &); - -task_id_with_noop_default_t - get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &); - -task_id_with_noop_default_t - get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &); - -} // namespace FlexFlow - -#endif diff --git 
a/lib/task-spec/index.dox b/lib/task-spec/index.dox new file mode 100644 index 0000000000..187bbe9638 --- /dev/null +++ b/lib/task-spec/index.dox @@ -0,0 +1,14 @@ +namespace FlexFlow { +/** + +\page task-spec task-spec + +\brief An intermediate layer between the compiler and the runtime. Contains code for lowering the \ref MappedParallelComputationGraph exported from the \ref compiler down to a granularity that the runtime can actually execute. Also contains the functions that translate between logical operators (i.e., \ref op-attrs-ops) and actual calls to \ref kernels. + +Primary components: +- \subpage task-spec-ops "": \copybrief task-spec-ops +- \subpage task-spec-dynamic-graph "": \copybrief task-spec-dynamic-graph +- \subpage task-argument-accessor "": \copybrief task-argument-accessor + +*/ +} diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc index 4270119612..857fed1a84 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc @@ -12,9 +12,11 @@ namespace FlexFlow { -LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg, - LossAttrs const &loss_attrs, - dynamic_tensor_guid_t logit_tensor) { +LossInsertionResult perform_loss_insertion( + DynamicOpenDataflowGraph const &dg, + LossAttrs const &loss_attrs, + dynamic_tensor_guid_t logit_tensor, + std::optional const &loss_mapping) { DynamicValueAttrs logit_value = assert_unwrap( find_output_value_attrs(dg, logit_tensor, mk_dynamic_tensor_role_fwd())); @@ -45,7 +47,7 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg, DynamicNodeAttrs{ /*task_type=*/DynamicTaskType::LOSS, /*device_coord=*/std::nullopt, - /*mapping=*/std::nullopt, + /*mapping=*/loss_mapping, /*op_attrs=*/TrainingOperationAttrs{loss_attrs}, /*layer_guid=*/mk_dynamic_layer_guid_for_loss(), /*per_device_op_state=*/std::nullopt, diff 
--git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc new file mode 100644 index 0000000000..ced98dfd44 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc @@ -0,0 +1,78 @@ +#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/pcg_operator_attrs.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_role.h" +#include "utils/containers/generate_map.h" +#include +#include +#include + +namespace FlexFlow { + +DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg( + MappedParallelComputationGraph const &mpcg) { + DynamicOpenDataflowGraph result = make_empty_dynamic_open_dataflow_graph(); + + for (auto const &[layer, attrs] : + get_parallel_layer_attrs_mapping(mpcg.pcg)) { + DynamicNodeAttrs result_attrs{ + /*task_type=*/std::nullopt, + /*device_coord=*/std::nullopt, + /*mapping=*/mpcg.mapped_tasks.at(layer), + /*op_attrs=*/TrainingOperationAttrs{attrs.op_attrs}, + /*pcg_layer_guid=*/dynamic_layer_guid_t{layer}, + /*per_device_op_state=*/std::nullopt, + }; + + std::unordered_map result_inputs = + transform(get_incoming_tensors(mpcg.pcg, layer), + [&](TensorSlotName const &slot_name, + parallel_tensor_guid_t const &tensor) { + ParallelTensorAttrs attrs = + get_parallel_tensor_attrs(mpcg.pcg, tensor); + return std::pair{ + DynamicTensorSlot{ + /*slot_name=*/slot_name, + /*slot_tensor_role=*/std::nullopt, + }, + DynamicValueAttrs{ + /*tensor_guid=*/dynamic_tensor_guid_t{tensor}, + /*parallel_tensor_shape=*/attrs.shape, 
+ /*shard_coord=*/std::nullopt, + /*accessor=*/std::nullopt, + /*role=*/std::nullopt, + }, + }; + }); + std::unordered_map result_outputs = + transform(get_outgoing_tensors(mpcg.pcg, layer), + [&](TensorSlotName const &slot_name, + parallel_tensor_guid_t const &tensor) { + ParallelTensorAttrs attrs = + get_parallel_tensor_attrs(mpcg.pcg, tensor); + return std::pair{ + DynamicTensorSlot{ + /*slot_name=*/slot_name, + /*slot_tensor_role=*/std::nullopt, + }, + DynamicValueAttrs{ + /*tensor_guid=*/dynamic_tensor_guid_t{tensor}, + /*parallel_tensor_shape=*/attrs.shape, + /*shard_coord=*/std::nullopt, + /*accessor=*/std::nullopt, + /*role=*/std::nullopt, + }, + }; + }); + + result.invocations.emplace(result_inputs, result_attrs, result_outputs); + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc new file mode 100644 index 0000000000..d613194d14 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc @@ -0,0 +1,29 @@ +#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h" +#include + +namespace FlexFlow { + +SerializableDynamicNodeAttrs + dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &attrs) { + return SerializableDynamicNodeAttrs{ + /*task_type=*/attrs.task_type, + /*device_coord=*/attrs.device_coord, + /*mapping=*/attrs.mapping, + /*op_attrs=*/attrs.op_attrs, + /*layer_guid=*/attrs.layer_guid, + }; +} + +DynamicNodeAttrs dynamic_node_attrs_from_serializable( + SerializableDynamicNodeAttrs const &attrs) { + return DynamicNodeAttrs{ + /*task_type=*/attrs.task_type, + /*device_coord=*/attrs.device_coord, + /*mapping=*/attrs.mapping, + /*op_attrs=*/attrs.op_attrs, + /*layer_guid=*/attrs.layer_guid, + /*per_device_op_state=*/std::nullopt, + }; +} + +} // namespace FlexFlow diff --git 
a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc new file mode 100644 index 0000000000..334623ee67 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc @@ -0,0 +1,31 @@ +#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h" +#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h" +#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h" +#include "utils/containers/map_values.h" + +namespace FlexFlow { + +SerializableDynamicNodeInvocation dynamic_node_invocation_to_serializable( + DynamicNodeInvocation const &invocation) { + return SerializableDynamicNodeInvocation{ + /*inputs=*/map_values(invocation.inputs, + dynamic_value_attrs_to_serializable), + /*node_attrs=*/dynamic_node_attrs_to_serializable(invocation.node_attrs), + /*outputs=*/ + map_values(invocation.outputs, dynamic_value_attrs_to_serializable), + }; +} + +DynamicNodeInvocation dynamic_node_invocation_from_serializable( + SerializableDynamicNodeInvocation const &invocation) { + return DynamicNodeInvocation{ + /*inputs=*/map_values(invocation.inputs, + dynamic_value_attrs_from_serializable), + /*node_attrs=*/ + dynamic_node_attrs_from_serializable(invocation.node_attrs), + /*outputs=*/ + map_values(invocation.outputs, dynamic_value_attrs_from_serializable), + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc new file mode 100644 index 0000000000..2dc0b509ab --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc @@ -0,0 +1,27 @@ +#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h" +#include + +namespace FlexFlow { + +SerializableDynamicValueAttrs + 
dynamic_value_attrs_to_serializable(DynamicValueAttrs const &attrs) { + return SerializableDynamicValueAttrs{ + /*tensor_guid=*/attrs.tensor_guid, + /*parallel_tensor_shape=*/attrs.parallel_tensor_shape, + /*shard_coord=*/attrs.shard_coord, + /*role=*/attrs.role, + }; +} + +DynamicValueAttrs dynamic_value_attrs_from_serializable( + SerializableDynamicValueAttrs const &attrs) { + return DynamicValueAttrs{ + /*tensor_guid=*/attrs.tensor_guid, + /*parallel_tensor_shape=*/attrs.parallel_tensor_shape, + /*shard_coord=*/attrs.shard_coord, + /*accessor=*/std::nullopt, + /*role=*/attrs.role, + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc index ea253b63f8..402e0ef055 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc @@ -15,7 +15,7 @@ bool value_is_shard_expanded(DynamicValueAttrs const &n) { bool no_part_of_graph_is_shard_expanded(DynamicOpenDataflowGraph const &g) { auto slot_is_shard_expanded = [](DynamicTensorSlot const &) -> bool { - return true; + return false; }; return no_part_of_dynamic_graph_satisfies(g, @@ -81,4 +81,19 @@ std::unordered_set }); } +DynamicOpenDataflowGraph + perform_shard_expansion(DynamicOpenDataflowGraph const &g) { + + ASSERT(no_part_of_graph_is_shard_expanded(g)); + + DynamicOpenDataflowGraph result = + flatmap_dynamic_invocation_set(g, [&](DynamicNodeInvocation const &i) { + return perform_shard_expansion_for_invocation(i); + }); + + ASSERT(graph_is_fully_shard_expanded(result)); + + return result; +} + } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc index 3450b5d268..9b040b6021 100644 --- a/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc @@ 
-46,9 +46,12 @@ std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) { } // namespace FlexFlow +///\cond namespace std { size_t hash::operator()( ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const { return std::hash{}(x.function_ptr); } +///\endcond + } // namespace std diff --git a/lib/task-spec/src/task-spec/generic_task_impl_function.cc b/lib/task-spec/src/task-spec/generic_task_impl_function.cc index 4abd1ab644..84bed4e9d2 100644 --- a/lib/task-spec/src/task-spec/generic_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/generic_task_impl_function.cc @@ -45,9 +45,12 @@ std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) { } // namespace FlexFlow +///\cond namespace std { size_t hash::operator()( ::FlexFlow::GenericTaskImplFunction const &x) const { return std::hash{}(x.function_ptr); } +///\endcond + } // namespace std diff --git a/lib/task-spec/src/task-spec/init_op_task_impl_function.cc b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc index 4cd55fc488..ce72dcb630 100644 --- a/lib/task-spec/src/task-spec/init_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc @@ -45,9 +45,11 @@ std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) { } // namespace FlexFlow +///\cond namespace std { size_t hash::operator()( ::FlexFlow::InitOpTaskImplFunction const &x) const { return std::hash{}(x.function_ptr); } } // namespace std +///\endcond diff --git a/lib/task-spec/src/task-spec/per_device_op_state.cc b/lib/task-spec/src/task-spec/per_device_op_state.cc index 12b649e663..438cd8886c 100644 --- a/lib/task-spec/src/task-spec/per_device_op_state.cc +++ b/lib/task-spec/src/task-spec/per_device_op_state.cc @@ -3,7 +3,7 @@ namespace FlexFlow { -PerDeviceOpState get_device_state_from_device_specific( +PerDeviceOpState get_per_device_op_state_from_device_specific( DeviceSpecificPerDeviceOpState const &device_specific, device_id_t device_idx) { return 
device_specific.visit( diff --git a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc deleted file mode 100644 index 20e0d00c57..0000000000 --- a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc +++ /dev/null @@ -1,243 +0,0 @@ -#include "task-spec/task_id_with_noop_default_t.h" -#include "utils/overload.h" - -namespace FlexFlow { - -task_id_with_noop_default_t lift_task_id_t(task_id_t task_id) { - return task_id_with_noop_default_t{task_id}; -} - -task_id_with_noop_default_t default_noop_task() { - return task_id_with_noop_default_t{std::monostate{}}; -} - -task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t( - op_task_id_t op_task_id, ComputationGraphOpAttrs const &op_attrs) { - switch (op_task_id) { - case op_task_id_t::INIT: - return get_init_task_id_for_op_attrs(op_attrs); - case op_task_id_t::FWD: - return get_fwd_task_id_for_op_attrs(op_attrs); - case op_task_id_t::BWD: - return get_bwd_task_id_for_op_attrs(op_attrs); - default: - PANIC("Unhandled op_task_id_t", op_task_id); - } -} - -task_id_with_noop_default_t - get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { - - return op_attrs.visit(overload{ - [](BatchMatmulAttrs const &) { return default_noop_task(); }, - [](BatchNormAttrs const &) { - return lift_task_id_t(task_id_t::BATCHNORM_INIT_TASK_ID); - }, - [](BroadcastAttrs const &) { return default_noop_task(); }, - [](CastAttrs const &) { return default_noop_task(); }, - [](ConcatAttrs const &) { return default_noop_task(); }, - [](Conv2DAttrs const &) { - return lift_task_id_t(task_id_t::CONV2D_INIT_TASK_ID); - }, - [](DropoutAttrs const &) { - return lift_task_id_t(task_id_t::DROPOUT_INIT_TASK_ID); - }, - [](ElementBinaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID); - }, - [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTUNARY_INIT_TASK_ID); - }, - [](EmbeddingAttrs 
const &) { return default_noop_task(); }, - [](FlatAttrs const &) { return default_noop_task(); }, - [](GatherAttrs const &) { - return lift_task_id_t(task_id_t::GATHER_INIT_TASK_ID); - }, - [](InputAttrs const &) { return default_noop_task(); }, - [](LayerNormAttrs const &) { - return lift_task_id_t(task_id_t::LAYERNORM_INIT_TASK_ID); - }, - [](LinearAttrs const &) { - return lift_task_id_t(task_id_t::LINEAR_INIT_TASK_ID); - }, - [](MultiHeadAttentionAttrs const &) { - return lift_task_id_t(task_id_t::ATTENTION_INIT_TASK_ID); - }, - [](NoopAttrs const &) { return default_noop_task(); }, - [](Pool2DAttrs const &) { - return lift_task_id_t(task_id_t::POOL2D_INIT_TASK_ID); - }, - [](ReduceAttrs const &) { - return lift_task_id_t(task_id_t::REDUCE_INIT_TASK_ID); - }, - [](ReshapeAttrs const &) { return default_noop_task(); }, - [](ReverseAttrs const &) { return default_noop_task(); }, - [](SoftmaxAttrs const &) { - return lift_task_id_t(task_id_t::SOFTMAX_INIT_TASK_ID); - }, - [](SplitAttrs const &) { return default_noop_task(); }, - [](TopKAttrs const &) { return default_noop_task(); }, - [](TransposeAttrs const &) { return default_noop_task(); }, - [](WeightAttrs const &) { return default_noop_task(); }, - }); -} - -task_id_with_noop_default_t - get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { - - return op_attrs.visit(overload{ - [](BatchMatmulAttrs const &) { - return lift_task_id_t(task_id_t::BATCHMATMUL_FWD_TASK_ID); - }, - [](BatchNormAttrs const &) { - return lift_task_id_t(task_id_t::BATCHNORM_FWD_TASK_ID); - }, - [](BroadcastAttrs const &) { - return lift_task_id_t(task_id_t::BROADCAST_FWD_TASK_ID); - }, - [](CastAttrs const &) { - return lift_task_id_t(task_id_t::CAST_FWD_TASK_ID); - }, - [](ConcatAttrs const &) { - return lift_task_id_t(task_id_t::CONCAT_FWD_TASK_ID); - }, - [](Conv2DAttrs const &) { - return lift_task_id_t(task_id_t::CONV2D_FWD_TASK_ID); - }, - [](DropoutAttrs const &) { - return 
lift_task_id_t(task_id_t::DROPOUT_FWD_TASK_ID); - }, - [](ElementBinaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID); - }, - [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTUNARY_FWD_TASK_ID); - }, - [](EmbeddingAttrs const &) { - return lift_task_id_t(task_id_t::EMBED_FWD_TASK_ID); - }, - [](FlatAttrs const &) { - return lift_task_id_t(task_id_t::FLAT_FWD_TASK_ID); - }, - [](GatherAttrs const &) { - return lift_task_id_t(task_id_t::GATHER_FWD_TASK_ID); - }, - [](InputAttrs const &) { return default_noop_task(); }, - [](LayerNormAttrs const &) { - return lift_task_id_t(task_id_t::LAYERNORM_FWD_TASK_ID); - }, - [](LinearAttrs const &) { - return lift_task_id_t(task_id_t::LINEAR_FWD_TASK_ID); - }, - [](MultiHeadAttentionAttrs const &) { - return lift_task_id_t(task_id_t::ATTENTION_FWD_TASK_ID); - }, - [](NoopAttrs const &) { return default_noop_task(); }, - [](Pool2DAttrs const &) { - return lift_task_id_t(task_id_t::POOL2D_FWD_TASK_ID); - }, - [](ReduceAttrs const &) { - return lift_task_id_t(task_id_t::REDUCE_FWD_TASK_ID); - }, - [](ReshapeAttrs const &) { - return lift_task_id_t(task_id_t::RESHAPE_FWD_TASK_ID); - }, - [](ReverseAttrs const &) { - return lift_task_id_t(task_id_t::REVERSE_FWD_TASK_ID); - }, - [](SoftmaxAttrs const &) { - return lift_task_id_t(task_id_t::SOFTMAX_FWD_TASK_ID); - }, - [](SplitAttrs const &) { - return lift_task_id_t(task_id_t::SPLIT_FWD_TASK_ID); - }, - [](TopKAttrs const &) { - return lift_task_id_t(task_id_t::TOPK_FWD_TASK_ID); - }, - [](TransposeAttrs const &) { - return lift_task_id_t(task_id_t::TRANSPOSE_FWD_TASK_ID); - }, - [](WeightAttrs const &) { return default_noop_task(); }, - }); -} - -task_id_with_noop_default_t - get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { - - return op_attrs.visit(overload{ - [](BatchMatmulAttrs const &) { - return lift_task_id_t(task_id_t::BATCHMATMUL_BWD_TASK_ID); - }, - [](BatchNormAttrs const &) { - return 
lift_task_id_t(task_id_t::BATCHNORM_BWD_TASK_ID); - }, - [](BroadcastAttrs const &) { - return lift_task_id_t(task_id_t::BROADCAST_BWD_TASK_ID); - }, - [](CastAttrs const &) { - return lift_task_id_t(task_id_t::CAST_BWD_TASK_ID); - }, - [](ConcatAttrs const &) { - return lift_task_id_t(task_id_t::CONCAT_BWD_TASK_ID); - }, - [](Conv2DAttrs const &) { - return lift_task_id_t(task_id_t::CONV2D_BWD_TASK_ID); - }, - [](DropoutAttrs const &) { - return lift_task_id_t(task_id_t::DROPOUT_BWD_TASK_ID); - }, - [](ElementBinaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID); - }, - [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTUNARY_BWD_TASK_ID); - }, - [](EmbeddingAttrs const &) { - return lift_task_id_t(task_id_t::EMBED_BWD_TASK_ID); - }, - [](FlatAttrs const &) { - return lift_task_id_t(task_id_t::FLAT_BWD_TASK_ID); - }, - [](GatherAttrs const &) { - return lift_task_id_t(task_id_t::GATHER_BWD_TASK_ID); - }, - [](InputAttrs const &) { return default_noop_task(); }, - [](LayerNormAttrs const &) { - return lift_task_id_t(task_id_t::LAYERNORM_BWD_TASK_ID); - }, - [](LinearAttrs const &) { - return lift_task_id_t(task_id_t::LINEAR_BWD_TASK_ID); - }, - [](MultiHeadAttentionAttrs const &) { - return lift_task_id_t(task_id_t::ATTENTION_BWD_TASK_ID); - }, - [](NoopAttrs const &) { return default_noop_task(); }, - [](Pool2DAttrs const &) { - return lift_task_id_t(task_id_t::POOL2D_BWD_TASK_ID); - }, - [](ReduceAttrs const &) { - return lift_task_id_t(task_id_t::REDUCE_BWD_TASK_ID); - }, - [](ReshapeAttrs const &) { - return lift_task_id_t(task_id_t::RESHAPE_BWD_TASK_ID); - }, - [](ReverseAttrs const &) { - return lift_task_id_t(task_id_t::REVERSE_BWD_TASK_ID); - }, - [](SoftmaxAttrs const &) { - return lift_task_id_t(task_id_t::SOFTMAX_BWD_TASK_ID); - }, - [](SplitAttrs const &) { - return lift_task_id_t(task_id_t::SPLIT_BWD_TASK_ID); - }, - [](TopKAttrs const &) { - return lift_task_id_t(task_id_t::TOPK_BWD_TASK_ID); - }, 
- [](TransposeAttrs const &) { - return lift_task_id_t(task_id_t::TRANSPOSE_BWD_TASK_ID); - }, - [](WeightAttrs const &) { return default_noop_task(); }, - }); -} - -} // namespace FlexFlow diff --git a/lib/utils/README.md b/lib/utils/README.md deleted file mode 100644 index a9c1ad3e88..0000000000 --- a/lib/utils/README.md +++ /dev/null @@ -1,449 +0,0 @@ -# utils - -## visitable - -[!WARNING] -`visitable` is deprecated, new code should instead use `dtgen` - -### Motivation - -FlexFlow's codebase makes heavy use of "plain old data"[^2] types[^1] (referred to as _product types_ in the rest of this document) such as the following: -```cpp -struct Person { - std::string first_name; - std::string last_name; - int age; -}; -``` -However, this standard implementation defines a set of behaviors that we, the FlexFlow developers, find undesirable: - -1. Partial constructibility: for many product types partial constructibility can make code bug-prone. For example, let us consider the following valid code: -```cpp -struct Person { - Person() = delete; - - std::string first_name; - std::string last_name; - int age = 0; -}; - -Person p{"donald", "knuth"}; -``` -This code will compile just fine, but will silently use a nonsensical value of `age`. -Even worse, let us imagine that in the someone else adds an additional field `is_male`. -Unless they find and update every place in which `Person` is constructed, they will be left with the following code, which -compiles without errors but is (as of writing this) incorrect. -```cpp -struct Person { - Person() = delete; - - std::string first_name; - std::string last_name; - int age = 0; - bool is_male = false; -}; - -Person p{"donald", "knuth", 85}; -``` - -Not only can single fields be undefined/invalid, but whole structs can silently be filled with incorrect values if default constructibility is enabled: -```cpp -Person some_function() { - Person p; - if (...) 
{ - p = {"donald", "knuth", 85}; - } - return p; -} -``` -If the `if` branch is not taken, we will return a `Person` with nonsensical values, as there do not exist any values that naturally form a default. -We could initalize the values as follows -```cpp -struct Person { - std::string first_name; // initializes to "" - std::string last_name; // initializes to "" - int age = 0; -} -``` -but this is a completely useless value, and if it shows up anywhere in our code it's probably a bug, since a nameless, age 0 person is probably not a helpful value to have. - -3. For product types, `operator==` and `operator!=` are trivial, but still have to be written and maintained, and can easily lead to bugs. For example, -``` -struct Person { - Person() = delete; - Person(std::string const &first_name, - std::string const &last_name, - int age) - : first_name(first_name), - last_name(last_name), - age(age), - { } - - friend bool operator==(Person const &lhs, Person const &rhs) { - return lhs.first_name == rhs.first_name - && lhs.last_name == rhs.last_name - && lhs.age == rhs.age; - } - - friend bool operator!=(Person const &lhs, Person const &rhs) { - return lhs.first_name != rhs.first_name - || lhs.last_name != rhs.last_name - || lhs.age != rhs.age; - } - - std::string first_name; - std::string last_name; - int age; -}; -``` -If we take the previous example of adding an additional `is_male` field to `Person`, it can be easy to miss a location, leading to incorrectness. 
-For example, we could quite easily end up with -```cpp -struct Person { - Person() = delete; - Person(std::string const &first_name, - std::string const &last_name, - int age, - bool is_male) - : first_name(first_name), - last_name(last_name), - age(age), - is_male(is_male) - { } - - friend bool operator==(Person const &lhs, Person const &rhs) { - return lhs.first_name == rhs.first_name - && lhs.last_name == rhs.last_name - && lhs.age == rhs.age - && lhs.is_male == rhs.is_male; - } - - friend bool operator!=(Person const &lhs, Person const &rhs) { - return lhs.first_name != rhs.first_name - || lhs.last_name != rhs.last_name - || lhs.age != rhs.age; - // oops, forgot to update with the new is_male field. Have fun debugging :P - } - - std::string first_name; - std::string last_name; - int age; - bool is_male; -}; -``` -and for product types with more fields this grows increasingly tedious to write and maintain. - -4. Hashing: hashing product types is relatively trivial, as long as each of the fields is hashable. But again, we have to do a bunch of extra work to specify this, and this work has to be done for each product type in the codebase. -(**Note:** from here on the examples are growing to grow increasingly long to emphasize the amount of code that needs to be written. 
Feel free to skim these longer code snippets if you trust the statement that implementing product types in vanilla C++ is tedious) -```cpp -struct Person { - Person() = delete; - Person(std::string const &first_name, - std::string const &last_name, - int age, - bool is_male) - : first_name(first_name), - last_name(last_name), - age(age), - is_male(is_male) - { } - - friend bool operator==(Person const &lhs, Person const &rhs) { - return lhs.first_name == rhs.first_name - && lhs.last_name == rhs.last_name - && lhs.age == rhs.age - && lhs.is_male == rhs.is_male; - } - - friend bool operator!=(Person const &lhs, Person const &rhs) { - return lhs.first_name != rhs.first_name - || lhs.last_name != rhs.last_name - || lhs.age != rhs.age - || lhs.is_male != rhs.is_male; - } - - std::string first_name; - std::string last_name; - int age; - bool is_male; -}; - -// BEGIN new code -namespace std { - -template <> -struct hash<::Person> { - size_t operator()(::Person const &p) const { - size_t result = 0; - hash_combine(result, p.first_name); - hash_combine(result, p.last_name); - hash_combine(result, p.age); - hash_combine(result, p.is_male); - } -}; -// END new code - -} -``` -and if we also want to support `std::set` (which requires `Person` to be ordered), we also have to add `operator<` -```cpp -struct Person { - Person() = delete; - Person(std::string const &first_name, - std::string const &last_name, - int age, - bool is_male) - : first_name(first_name), - last_name(last_name), - age(age), - is_male(is_male) - { } - - friend bool operator==(Person const &lhs, Person const &rhs) { - return lhs.first_name == rhs.first_name - && lhs.last_name == rhs.last_name - && lhs.age == rhs.age - && lhs.is_male == rhs.is_male; - } - - friend bool operator!=(Person const &lhs, Person const &rhs) { - return lhs.first_name != rhs.first_name - || lhs.last_name != rhs.last_name - || lhs.age != rhs.age - || lhs.is_male != rhs.is_male; - } - -// BEGIN new code - friend bool operator<(Person 
const &lhs, Person const &rhs) { - return lhs.first_name < rhs.first_name - || lhs.last_name < rhs.last_name - || lhs.age < rhs.age - || lhs.is_male < rhs.is_male; - } -// END new code - - std::string first_name; - std::string last_name; - int age; - bool is_male; -}; - -namespace std { - -template <> -struct hash<::Person> { - size_t operator()(::Person const &p) const { - size_t result = 0; - hash_combine(result, p.first_name); - hash_combine(result, p.last_name); - hash_combine(result, p.age); - hash_combine(result, p.is_male); - } -}; - -} -``` -Even for such a simple datatype, we have a significant amount of code that must be written and maintained. -FlexFlow's codebase contains tens if not hundreds of these product types, and so the approach above is infeasible. - -[^1]: aka product types, aka Haskell's `data`. Essentially types that are just a tuple of fields with names. -[^2]: by "plain old data" we refer to the general idea behind [C++'s POD](https://en.cppreference.com/w/cpp/named_req/PODType), but not its exact definition - -### Adding new visitable types - -FlexFlow's `visitable` support provides an easy way to express product types, and prevents any of the bugs listed above. -To express the above definition of `Person` using `visitable`, we would write the following code: -```cpp -struct Person { - std::string first_name; - std::string last_name; - int age; - req is_male; -}; -FF_VISITABLE_STRUCT(Person, first_name, last_name, age, is_male); -``` -The key addition here is the calling the `FF_VISITABLE_STRUCT` macro. -In addition to defining all of the above functions, this macro also performs a series of compile-time checks (via `static_assert`) to check that the product type is implemented correctly (for example, it will check that the type is not default constructible[^3]). -The only additional change is the addition of the `req` (which stands for `required`) wrapper on the last field. 
-Conceptually, `req` is simple: it removes default constructibility of the type it wraps (if the last field in the struct is already not default-constructible, no `req` is needed). -Don't worry if you forget to add a `req`: `FF_VISITABLE_STRUCT` will check that your type properly disables default and partial construction (see [Macro Reference](#macro-reference)). -Combined with [aggregate initialization](https://en.cppreference.com/w/cpp/language/aggregate_initialization), we are able to construct a `Person` as follows: -```cpp -Person p = { "donald", "knuth", 85, true }; -``` -and any subset of the fields would raise an error at compile time. Without any additional code, `Person` supports `operator==`, `operator!=`, `std::hash`, and `operator<`, as well as other more specific features (e.g., [JSON serialization](#json-serialization)) - -[^3]: The full list of properties is detailed in [Macros Details](#macro-reference) - -### Limitations - -`visitable` types have two primary limitations. First, they do not support initialization with `(...)`: -```cpp -Person p{ "donald", "knuth", 85, true }; // CORRECT -Person p2("robert", "tarjan", 75, true); // ERROR -``` -Secondly, template types cannot be visitable (we hope to remove this limitation in the distant future), but instantiations of them can. -```cpp -template -struct MyLists { - std::vector list1; - req> list2; -}; -FF_VISITABLE_STRUCT(MyLists, list1, list2); // ERROR - -using MyInts = MyLists; - -FF_VISITABLE_STRUCT(MyInts, list1, list2); // CORRECT -``` -A smaller limitation is that `FF_VISITABLE_STRUCT` only works from within the `FlexFlow` namespace (this is not much of an issue as all of the `FlexFlow` code resides in a single namespace). - -### Advanced Features - -While `FF_VISITABLE_STRUCT` matches the behavior of many product types in FlexFlow's codebase, there are exceptions. Many of these resemble the code below: -```cpp -struct Cow { ... 
}; - -struct TownPopulation { - std::vector people; - std::vector cows; -}; -``` -Unlike in the `Person` example, `TownPopulation` has an obvious default value: an empty town (i.e., both people and cow are empty). -However, if we write -```cpp -FF_VISITABLE_STRUCT(TownPopulation, people, cows); // ERROR: TownPopulation should not be default constructible -``` -we get the something approximating the error in the comment. -If we were to abandon `visitable` entirely, we would have to write (**Note:** long code example to demonstrate how tedious this is, feel free to skim) -```cpp -struct Cow { ... }; - -struct TownPopulation { - TownPopulation() = default; - TownPopulation(std::vector const &people, - std::vector const &cows) - : people(people), - cows(cows) - { } - - friend bool operator==(TownPopulation const &lhs, TownPopulation const &rhs) { - return lhs.people == rhs.people - && lhs.cows == rhs.cows; - } - - friend bool operator!=(TownPopulation const &lhs, TownPopulation const &rhs) { - return lhs.people != rhs.people - || lhs.cows != rhs.cows; - } - - friend bool operator<(TownPopulation const &lhs, TownPopulation const &rhs) { - return lhs.people < rhs.people - || lhs.cows < rhs.cows; - } - - std::vector people; - std::vector cows; -}; - -namespace std { - -template <> -struct hash<::TownPopulation> { - size_t operator()(::TownPopulation const &t) const { - size_t result = 0; - hash_combine(result, t.people); - hash_combine(result, t.cows); - return result; - } -}; - -} -``` -which is tedious and bug-prone. 
-To remove the constructibility checks performed by `FF_VISITABLE_STRUCT`, we simply use `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION` instead: -```cpp -struct TownPopulation { - TownPopulation() = default; - TownPopulation(std::vector const &people, - std::vector const &cows) - : people(people), - cows(cows) - { } - - std::vector people; - std::vector cows; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TownPopulation, people, cows); -``` -This is also useful for defining structs with specific non-standard constructor signatures. For example, -```cpp -struct TownPopulation { - TownPopulation() = default; - - // constructs a TownPopulation filled with the given number of random people and cows - TownPopulation(int num_people, - int num_cows) - : people(generate_random_people_of_size(num_people)), - cows(generate_random_cows_of_size(num_cows)) - { } - - TownPopulation(std::vector const &people, - std::vector const &cows) - : people(people), - cows(cows) - { } - - std::vector people; - std::vector cows; -}; -``` - -#### JSON Serialization - -TODO - -### Macro Reference - -The properties that are checked by each macro are as follows: - -1. `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TYPENAME, ...fields...)` - - If `length(fields) > 0`: - - Every field in `TYPENAME` is `std::hash`able - - Every field in `TYPENAME` is listed under `fields` - - `TYPENAME` is copy constructible - - `TYPENAME` is move constructible - - `TYPENAME` is copy assignable - - `TYPENAME` is move assignable - - Every field in `TYPENAME` supports `operator==` - - Every field in `TYPENAME` supports `operator!=` - - If `length(fields) == 0`: - - `TYPENAME` is copy constructible - - `TYPENAME` is move constructible - - `TYPENAME` is copy assignable - - `TYPENAME` is move assignable - -2. 
`FF_VISITABLE_STRUCT(TYPENAME, ...fields...)` (in addition to the checks in `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION`) - - If `length(fields) > 0`: - - `TYPENAME` is only constructible when all fields are passed in[^4] - - If `length(fields) == 0`: - - `TYPENAME` is default constructible - -[^4]: This is usually resolved by either wrapping the last field in a `req` or using `FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION` - -### Internals - -TODO - -## stack_vector, stack_string, stack_map - -## strong_typedef - -## containers - -## graph - -## bidict - -## type_traits - -## test_types diff --git a/lib/utils/include/utils/any_value_type/any_value_type.h b/lib/utils/include/utils/any_value_type/any_value_type.h index a99ce5c8f0..fc4d6b488d 100644 --- a/lib/utils/include/utils/any_value_type/any_value_type.h +++ b/lib/utils/include/utils/any_value_type/any_value_type.h @@ -34,7 +34,9 @@ struct any_value_type { std::function hash; std::function to_string; + ///\cond friend std::hash; + ///\endcond }; template @@ -54,6 +56,7 @@ any_value_type make_any_value_type(T const &t) { } // namespace FlexFlow +///\cond namespace std { template <> @@ -62,5 +65,6 @@ struct hash<::FlexFlow::any_value_type> { }; } // namespace std +///\endcond #endif diff --git a/lib/utils/include/utils/cli/index.dox b/lib/utils/include/utils/cli/index.dox new file mode 100644 index 0000000000..a89dca5364 --- /dev/null +++ b/lib/utils/include/utils/cli/index.dox @@ -0,0 +1,9 @@ +/** + +\page utils-cli utils/cli + +A basic CLI library for use by programs in \ref bin. +For an example of how to use it, see the following snippet from \ref export-model-arch. 
+ +\snippet bin/export-model-arch/src/export-model-arch/main.cc utils/cli example +*/ diff --git a/lib/utils/include/utils/containers/index.dox b/lib/utils/include/utils/containers/index.dox new file mode 100644 index 0000000000..9b3865dd78 --- /dev/null +++ b/lib/utils/include/utils/containers/index.dox @@ -0,0 +1,18 @@ +/** + +\page utils-containers utils/containers + +A bunch of generic functions for transforming various C++ standard library containers. +These should generally be preferred over raw for loops in %FlexFlow code. +Some of the most commonly-used functions are listed below, but you should ideally slowly work to familiarize yourself with everything in this directory. + +- \ref containers/transform.h +- \ref containers/filter.h +- \ref containers/contains.h +- \ref containers/generate_map.h +- \ref containers/get_only.h +- \ref containers/slice.h +- \ref containers/merge_disjoint_maps.h +- \ref containers/is_subseteq_of.h + +*/ diff --git a/lib/utils/include/utils/graph/README.md b/lib/utils/include/utils/graph/README.md deleted file mode 100644 index 5cf0c88015..0000000000 --- a/lib/utils/include/utils/graph/README.md +++ /dev/null @@ -1,277 +0,0 @@ -# graph - -## Design Considerations - -FlexFlow's graph library very intentionally attempts to balance performance and ease of use. -The graph library aims to have a very simple external interface that is highly decoupled from the underlying representations, so performance and internal implementations can be tuned and modified over time without breaking the code that uses the library. -Because FlexFlow's graphs are not on the scale of machine memory or not so large that single traversals takes nontrivial time, the graph library intentionally avoids performance opportunities that would expose many of these performance aspects to user code. 
-Of course, there are also some optimizations that simply have not been done due to time constraints: for example, algorithms currently are able to be specialized for the underlying representation being used, but this could be added without modifying the user-side interface. - -## Usage - -### Core Graph Variants - -There is no single type of graph. Should it be directed? Allow multiple edges between nodes? Should nodes and/or edges have information attached? -Because there is no single answer to this question, similar to [networkx](https://networkx.org/) we provide a number of different graph variants. -At their core, they are as follows: - -- `UndirectedGraph`: at most one edge allowed between every pair of nodes, edges are undirected. -- `DiGraph`: at most one edge allowed between every ordered pair of nodes, edges are directed (i.e., have a source node and a destination node) -- `MultiDiGraph`: arbitrary numbers of directed edges allowed between every pair of nodes. -- `DataflowGraph`: used to model computation graphs. See the [DataflowGraph](#dataflowgraph) section for a detailed explanation. - -Examples of the different graph variants are shown below. - -Example of `UndirectedGraph`: -```mermaid -flowchart TD - A(" ") - B(" ") - C(" ") - D(" ") - E(" ") - - A --- B - A --- C - B --- C - B --- B - D --- B -``` - -Example of `DiGraph`: -```mermaid -flowchart TD - A(" ") - B(" ") - C(" ") - D(" ") - E(" ") - F(" ") - - A --> F - B --> E - B --> C - B --> B - D --> B - C --> D -``` - -Example of `MultiDiGraph`: -```mermaid -flowchart TD - A - B - C - D - E - F - - A --> B - B --> C - C --> D - D --> A - B --> E - E --> B - D --> A - A --> E - D --> D - E --> E -``` - -Note that the node names are completely arbitrary: they have no apparent ordering or other meaning besides representing the topology of the graph. -This is the case with all of the 4 core graph classes. 
-Nodes are of type `Node`, and from a user perspective are simply opaque handles, and source and destination indices should similarly be considered opaque from a user point of view. -In addition, nodes should only be used in the context of their graph, so comparing or checking equality of nodes between different graphs (even of the same type) is undefined behavior[^1]. - -All three core graph variants allow insertion and deletion of both edges and nodes. -To add a node to an `UndirectedGraph g`, simply call `g.add_node()`, which will return a `Node` object. -For semantics closer to `networkx`'s method of adding nodes, `g.add_node_unsafe(my_node)` can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended. -The interface for node addition is identical for `DiGraph` and `MultiDiGraph`. -To add an edge between two nodes `Node n1` and `Node n2` to an `UndirectedGraph g`, call `g.add_edge({n1, n2})`. -In `UndirectedGraph` the order of the arguments of `add_edge` doesn't matter as edges are undirected, but the order does matter for `DiGraph`, `MultiDiGraph` and `DataflowGraph`. - -The last paragraph covered the base API used to write to graphs, but we also want to be able to read from graphs. -Reading from graphs is implemented with the `query_nodes` and `query_edges` methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations). -The argument to `query_nodes` is a `NodeQuery` (which is simply a set of `Node`s). -`query_nodes` then returns the intersection of the nodes in the graph and the nodes in the query. -The set of nodes in the query is actually an `optional`, so `nullopt` could also be passed, which would simply retrieve all nodes from the target graph (essentially `nullopt` acts as the set of all nodes that could ever exist). 
-`query_edges` functions similarly, but as with `add_edge` its behavior is differs slightly between the three graph variants. -`UndirectedGraph::query_edges` simply takes an optional set of nodes and returns all edges that touch any of those nodes. -`DiGraph::query_edges` allows separate sets for source and destination nodes, and `MultiDiGraph::query_edges` adds the ability to filter by source and destination indices as well. - -In practice you will rarely ever use `query_nodes` and `query_edges` as the graph library provides a large number of algorithms that do that work for you, but it can be helpful to understand this base layer if you ever need to implement your own algorithms. -The layer users will most commonly interact with is the interface provided within either the `algorithms.h` header files or the `algorithms` folders, present in their respective graph class folders. -They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as `get_nodes` to as complex as `get_transitive_reduction` and `get_dominators`. -Note that, due to the internal virtual inheritance structure, some functions for more privitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by `DiGraph`). -You may notice that the most of algorithms present take as arguments not `UndirectedGraph`, `DiGraph`, and `MultiDiGraph`, but rather `UndirectedGraphView`, `DiGraphView`, and `MultiDiGraphView`. -These `GraphView` objects represent read-only (i.e., immutable) graphs. -Similar to C++'s `const` semantics, `Graph`s can be coerced[^2] to `GraphView`s but not the other way around. -To transform a `GraphView` to a `Graph`, we can perform an explicit copy with `materialize_view`. -Both `Graph` and `GraphView` types follow normal value semantics. 
-This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the `Graph` and `GraphView` types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a `DiGraph` use for example `get_subgraph` to get a `DiGraphView` representing a part of this graph, modifications to the underlying `DiGraph` will not be mirrored in the `DiGraphView` and the `DiGraphView` will remain valid even after the base `DiGraph` leaves scope. - -At this point, however, we still have not discussed how to create a graph. -The user-facing graph interface is intentionally separated from the underlying graph representations, so representations can be changed without requiring any user-side code modifications besides the choice of which implementation to use. -For example, to construct a `DiGraph` which internally uses a representation such as `AdjacencyDiGraph` we do the following: -```cpp -DiGraph g = DiGraph::create(); -``` -Generally users will use underlying representations provided by the graph library, but advanced users can create their own implementations (see the [Internals](#internals) section). - -[^1]: At some point we will likely add actual runtime checks on this, but for now we rely on the user not to mess up. Currently the implementation will keep going silently until the incorrectness grows so large that something breaks/crashes. -[^2]: See if you're not familiar with the term _type coercion_ - -### DataflowGraph - -The primary abstraction for representing computation graphs / task graphs is the `DataflowGraph` interface (along with its variants, `OpenDataflowGraph`, `LabelleledDataflowGraph` and `OpenLabelleledDataflowGraph`). -At a high level, nodes represent multivariate functions (from tuples of inputs to tuple of outputs), while edges represent value uses of such functions. 
- -`DataflowGraph` is similar to `MultiDiGraph`, but with the following important differences: - - The edges entering, exiting a given nodes have a well-defined order. - - The outputs of a given node also have a well-defined order. - - `DataflowGraph`s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added. - -The main components of `DataflowGraph` are as follows: -- `DataflowInput`: used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator). -- `DataflowOutput`: used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator). -- `DataflowEdge`: wrapper around a `DataflowInput`, `DataflowOutput` pair between 2 nodes. -- `NodeAddedResult`: returned upon adding a new node. Contains the newly generated `Node` and the vector of `DataflowOutput`s for the given node. 
- -`DataflowGraph`s are constructed as follows: - -```cpp - auto g = DataflowGraph::create(); - - // Node with no inputs and 2 outputs - NodeAddedResult n1_result = g.add_node({}, 2); - Node n1 = n1_result.node; - DataflowOutput n1_o1 = n1_result.outputs[0]; - DataflowOutput n1_o2 = n1_result.outputs[1]; - - // Node with 2 inputs and 1 output - NodeAddedResult n2_result = g.add_node({n1_o1, n1_o2}, 1); - Node n2 = n2_result.node; - DataflowOutput n2_o1 = n2_result.outputs[0]; - - // Node with 1 input and 2 outputs - NodeAddedResult n3_result = g.add_node({n1_o2}, 1); - Node n3 = n3_result.node; - DataflowOutput n3_o1 = n3_result.outputs[0]; - DataflowOutput n3_o2 = n3_result.outputs[1]; - - // Node with 2 inputs and 1 output - NodeAddedResult n4_result = g.add_node({n2_o1, n3_o1}, 1); - Node n4 = n4_result.node; - DataflowOutput n4_o1 = n4_result.outputs[0]; -``` - -which generates the following graph - -```mermaid -flowchart TD - subgraph Node1[ ] - direction TB - N1Process[n1] - n1_o1((n1_o1)) - n1_o2((n1_o2)) - N1Process --> n1_o1 - N1Process --> n1_o2 - end - - subgraph Node2[ ] - direction TB - n2_i1((n2_i1)) - n2_i2((n2_i2)) - N2Process[n2] - n2_o1((o1)) - n2_i1 --> N2Process - n2_i2 --> N2Process - N2Process --> n2_o1 - end - - subgraph Node3[ ] - direction TB - n3_i1((n3_i1)) - N3Process[n3] - n3_o1((n3_o1)) - n3_o2((n3_o2)) - n3_i1 --> N3Process - N3Process --> n3_o1 - N3Process --> n3_o2 - end - - subgraph Node4[ ] - direction TB - n4_i1((n4_i1)) - n4_i2((n4_i2)) - N4Process[n4] - n4_o1((n4_o1)) - n4_i1 --> N4Process - n4_i2 --> N4Process - N4Process --> n4_o1 - end - - n1_o1 --> n2_i1 - n1_o2 --> n2_i2 - n1_o2 --> n3_i1 - n2_o1 --> n4_i1 - n3_o1 --> n4_i2 -``` - - -### Open Dataflow Variant - -`Open` should be interpreted in the topological sense: that is, a graph that contains some edges where one of the edge's 2 nodes is not present in the graph itself. 
-This graph class is particularly useful for processing a sub-graph of a given graph while still maintaining information regarding the edges that cross the cut. -`DataflowGraphInput` is used to represent the open (incoming) inputs to the graph. Note that, unlike `DataFlowInput`, `DataflowGraphInput`s are unordered (given that they are inputs to possibly several different nodes within the graph). - -### Labelled Dataflow Variant - -As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary. -Thus, FlexFlow's graph library provides the ability to add labels to `DataflowGraph`, through the `LabelleledDataflowGraph` and `OpenLabelleledDataflowGraph`, which allow users to label different components of the graph. -- `LabelledDataflowGraph` allows for labelling of `Node`s and `DataflowOutput`s. -- `OpenLabelledDataflowGraph` allows for labelling of `Node`s and `OpenDataflowValue`s, which is a variant describing both `DataflowOutput`s and `DataflowGraphInput`s. - -While the interfaces of these graphs differ slightly from the core graph variants, they still have the corresponding `add_node` methods, and `query_nodes`/`query_edges` methods. (Note that there is no `add_edge` method since, for `DataflowGraph`, edges are implicitly added when we add a node and specify its predecessors) -Note that all of the labelled graph types require that each element of the labelled types have a label, which is enforced via the interfaces they provide. -Partial labelling can be implement via wrapping the label type in `optional`. -Interacting with `Node` and `Edge` objects is still necessary to use the labelled graph types: intuitively the labelled graph types can be thought of as a pair of a core graph variant and a hash map the maps nodes/edges to labels. 
-As such, the labelled graph types provide the typical `at` method (as on `std::unordered_map`[^3]) and can be coerced to their underlying core graph variants. - -[^3]: `operator[]` currently is not present because all nodes must have labels and we don't require label types to be default constructible, though some simple template programming could probably add `operator[]` support in the cases where the label types _are_ default constructible. - - -## Internals - -Most of the major graph classes in the library come in sets of 4. For a given class `GlassName` we have: -1. `ClassName` -2. `ClassNameView` -3. `IClassName` -4. `IClassNameView` - -General rules which apply to most classes: -- `ClassName` (virtually) inherits from `ClassNameView`. Similarly, `IClassName` (virtually) inherits from `IClassNameView`. -- `ClassName` has, as a member variable, a `cow_ptr` of type `IClassName`. Same holds for `ClassNameView`. -Thus, the bulk of the inheritance that actually extends functionality is present among `IClassNameView` classes. - - -### cow_ptr and Interfaces - -The reason for the existence of the `View` variants has been explained in previous sections. -The existence of the `I(nterface)` variants stems from C++'s approach to modeling polymorphism. - -C++ polymorphism is achieved at runtime through the use of [virtual functions](https://www.learncpp.com/cpp-tutorial/virtual-functions/), which allow for a single function defined on some superclass to also work correctly on its subclasses. - -To create objects with polymorphic behaviour, we use the following syntax: -`BaseClass* obj = new DerivedClass(); //or alternatives such as std::shared_ptr obj = std::make_shared();` -Any call to `obj`'s member functions are resolved at runtime (dynamic binding), with C++ calling the most derived implementation of the function. - -While this pattern works nicely, the way instantiation is done leaves the burden of memory management on the user. 
-To address this, graph classes store a `cow_ptr` as a member variable, which point to instances of type equal to their corresponding interface class. - -All member functions present in `ClassName` and `ClassNameView` delegate their calls to their corresponding interface classes (which implement the actual logic), meaning that these classes essentially act as wrappers to their interface counterparts. - -### Virtual Inheritance -Due to the complexity of the graph library, diamond-style inheritance patterns emerge. -In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class. -To address this issue, we employ [Virtual Inheritance](https://en.wikipedia.org/wiki/Virtual_inheritance), which removes the ambiguity associated with the multiple copies. diff --git a/lib/utils/include/utils/graph/index.dox b/lib/utils/include/utils/graph/index.dox new file mode 100644 index 0000000000..75793b2ed4 --- /dev/null +++ b/lib/utils/include/utils/graph/index.dox @@ -0,0 +1,249 @@ +namespace FlexFlow { +/** + +\page utils-graph utils/graph + +- \subpage spization + +\note This documentation is somewhat out of date and, more importantly, \c utils/graph is in rather dire need of a reorganization, so take these docs with a grain of salt. + +\section design-considerations Design Considerations + +FlexFlow's graph library very intentionally attempts to balance performance and ease of use. +The graph library aims to have a very simple external interface that is highly decoupled from the underlying representations, so performance and internal implementations can be tuned and modified over time without breaking the code that uses the library. +Because FlexFlow's graphs are not on the scale of machine memory or not so large that single traversals takes nontrivial time, the graph library intentionally avoids performance opportunities that would expose many of these performance aspects to user code. 
+Of course, there are also some optimizations that simply have not been done due to time constraints: for example, algorithms currently are not able to be specialized for the underlying representation being used, but this could be added without modifying the user-side interface. + +\section usage Usage + +\subsection core-graph-variants Core Graph Variants + +There is no single type of graph. Should it be directed? Allow multiple edges between nodes? Should nodes and/or edges have information attached? +Because there is no single answer to this question, similar to networkx we provide a number of different graph variants. +At their core, they are as follows: + +- \ref UndirectedGraph "": at most one edge allowed between every pair of nodes, edges are undirected. +- \ref DiGraph "": at most one edge allowed between every ordered pair of nodes, edges are directed (i.e., have a source node and a destination node) +- \ref MultiDiGraph "": arbitrary numbers of directed edges allowed between every pair of nodes. +- \ref DataflowGraph "": used to model computation graphs. See the @ref dataflow-graph section for a detailed explanation. + +Examples of the different graph variants are shown below.
+ +Example of \ref UndirectedGraph "": +\dot +graph { + A [label=""]; + B [label=""]; + C [label=""]; + D [label=""]; + E [label=""]; + + A -- B + A -- C + B -- C + B -- B + D -- B +} +\enddot + +Example of \ref DiGraph "": +\dot +digraph { + A [label=""]; + B [label=""]; + C [label=""]; + D [label=""]; + E [label=""]; + F [label=""]; + + A -> F + B -> E + B -> C + B -> B + D -> B + C -> D +} +\enddot + +Example of \ref MultiDiGraph "": +\dot +digraph { + A [label=""]; + B [label=""]; + C [label=""]; + D [label=""]; + E [label=""]; + F [label=""]; + + A -> B + B -> C + C -> D + D -> A + B -> E + E -> B + D -> A + A -> E + D -> D + E -> E +} +\enddot + +Note that the node names are completely arbitrary: they have no apparent ordering or other meaning besides representing the topology of the graph. +This is the case with all of the 4 core graph classes. +Nodes are of type \ref Node, and from a user perspective are simply opaque handles, and source and destination indices should similarly be considered opaque from a user point of view. +In addition, nodes should only be used in the context of their graph, so comparing or checking equality of nodes between different graphs (even of the same type) is undefined behavior \ref graph-footnote-1 "[1]". + +All three core graph variants allow insertion and deletion of both edges and nodes. +To add a node to an \ref UndirectedGraph \c g, simply call g.add_node(), which will return a \ref Node object. +For semantics closer to networkx's method of adding nodes, g.add_node_unsafe(my_node) can be used. This is useful when constructing a modified copy of an existing graph (given that it maintains node bijection), though it is not generally recommended. +The interface for node addition is identical for \ref DiGraph and \ref MultiDiGraph. +To add an edge between two nodes \c n1 and \c n2 to an \ref UndirectedGraph \c g, call g.add_edge({n1, n2}). 
+In \ref UndirectedGraph the order of the arguments of \ref UndirectedGraph::add_edge "add_edge" doesn't matter as edges are undirected, but the order does matter for \ref DiGraph, \ref MultiDiGraph and \ref DataflowGraph. + +The last paragraph covered the base API used to write to graphs, but we also want to be able to read from graphs. +Reading from graphs is implemented with the \c query_nodes and \c query_edges methods, which can be thought of as executing a database query over the nodes and edges of the target graph, respectively (where queries are restricted to an incredibly simple set of operations). +The argument to \c query_nodes is a \ref NodeQuery (which is simply a set of \ref Node ""s). +\c query_nodes then returns the intersection of the nodes in the graph and the nodes in the query. +The set of nodes in the query is actually a \c std::optional, so \c std::nullopt could also be passed, which would simply retrieve all nodes from the target graph (essentially \c std::nullopt acts as the set of all nodes that could ever exist). +\c query_edges functions similarly, but as with \c add_edge its behavior differs slightly between the three graph variants. +\ref UndirectedGraph::query_edges simply takes an optional set of nodes and returns all edges that touch any of those nodes. +\ref DiGraph::query_edges allows separate sets for source and destination nodes, and \ref MultiDiGraph::query_edges adds the ability to filter by source and destination indices as well. + +In practice you will rarely ever use \c query_nodes and \c query_edges as the graph library provides a large number of algorithms that do that work for you, but it can be helpful to understand this base layer if you ever need to implement your own algorithms. +The layer users will most commonly interact with is the interface provided within either the \c algorithms.h header files or the \c algorithms folders, present in their respective graph class folders.
+They provide a large number of pre-implemented algorithms on graphs, ranging from as simple as \ref get_nodes to as complex as \ref transitive_reduction and \ref get_dominators. +Note that, due to the internal virtual inheritance structure, some functions for more primitive classes can be employed by the derived classes. (For example, `get_nodes` present in `node/algorithms.h` can be used by \ref DiGraph). +You may notice that most of the algorithms present take as arguments not \ref UndirectedGraph, \ref DiGraph, and \ref MultiDiGraph, but rather \ref UndirectedGraphView, \ref DiGraphView, and \ref MultiDiGraphView. +These GraphView objects represent read-only (i.e., immutable) graphs. +Similar to C++'s \c const semantics, Graphs can be coerced \ref graph-footnote-2 "[2]" to GraphViews, but not the other way around. +To transform a GraphView (e.g., \ref DiGraphView) to a Graph (e.g., \ref DiGraph), we can perform an explicit copy with a materialize function (e.g., \ref materialize_digraph_view). +Both Graph and GraphView types follow normal value semantics. +This may seem wasteful (oftentimes graphs are large objects that are passed around via reference to avoid making additional copies), but the Graph and GraphView types internally implement copy-on-write optimizations to only perform the minimum number of actual copies while maintaining immutability and lifetime safety (if you allocate a \ref DiGraph use for example \ref "get_subgraph(DiGraphView const &, std::unordered_set<Node> const &)" "get_subgraph" to get a \ref DiGraphView representing a part of this graph, modifications to the underlying \ref DiGraph will not be mirrored in the \ref DiGraphView and the \ref DiGraphView will remain valid even after the base \ref DiGraph leaves scope). + +At this point, however, we still have not discussed how to create a graph.
+The user-facing graph interface is intentionally separated from the underlying graph representations, so representations can be changed without requiring any user-side code modifications besides the choice of which implementation to use. +For example, to construct a \ref DiGraph which internally uses a representation such as \ref AdjacencyDiGraph we do the following: + +\code +DiGraph g = DiGraph::create<AdjacencyDiGraph>(); +\endcode + +Generally users will use underlying representations provided by the graph library, but advanced users can create their own implementations (see the \ref graph-internals section). + +\subsection dataflow-graph DataflowGraph + +The primary abstraction for representing computation graphs / task graphs is the \ref DataflowGraph interface (along with its variants, \ref OpenDataflowGraph, \ref LabelledDataflowGraph and \ref LabelledOpenDataflowGraph). +At a high level, nodes represent multivariate functions (from tuples of inputs to tuples of outputs), while edges represent value uses of such functions. + +\ref DataflowGraph is similar to \ref MultiDiGraph, but with the following important differences: + - The edges entering and exiting a given node have a well-defined order. + - The outputs of a given node also have a well-defined order. + - \ref DataflowGraph ""s are directed acyclic graphs. This is enforced by the interface used to construct them, since a node can only be added to the graph after all of its predecessor nodes have already been added. + +The main components of \ref DataflowGraph are as follows: +- \ref "DataflowInput": used to denote an entry in the ordered sequence of incoming dependencies (arguments) of a given node (operator). +- \ref "DataflowOutput": used to denote an entry in the ordered sequence of outgoing results (value uses) from a given node (operator). +- \ref "DataflowEdge": wrapper around a \ref DataflowInput, \ref DataflowOutput pair between 2 nodes. +- \ref "NodeAddedResult": returned upon adding a new node.
Contains the newly generated \ref Node and the \c std::vector of \ref DataflowOutput ""s for the given node. + +\ref DataflowGraph ""s are constructed as follows: + +\code + auto g = DataflowGraph::create(); + + // Node with no inputs and 2 outputs + NodeAddedResult n1_result = g.add_node({}, 2); + Node n1 = n1_result.node; + DataflowOutput n1_o1 = n1_result.outputs[0]; + DataflowOutput n1_o2 = n1_result.outputs[1]; + + // Node with 2 inputs and 1 output + NodeAddedResult n2_result = g.add_node({n1_o1, n1_o2}, 1); + Node n2 = n2_result.node; + DataflowOutput n2_o1 = n2_result.outputs[0]; + + // Node with 1 input and 2 outputs + NodeAddedResult n3_result = g.add_node({n1_o2}, 2); + Node n3 = n3_result.node; + DataflowOutput n3_o1 = n3_result.outputs[0]; + DataflowOutput n3_o2 = n3_result.outputs[1]; + + // Node with 2 inputs and 1 output + NodeAddedResult n4_result = g.add_node({n2_o1, n3_o1}, 1); + Node n4 = n4_result.node; + DataflowOutput n4_o1 = n4_result.outputs[0]; +\endcode + +which generates the following graph + +\dot +digraph { + node [shape=record]; + n1 [label="{<n1>|{<o1>|<o2>}}"]; + n2 [label="{{<i1>|<i2>}|<n2>|{<o1>}}"]; + n3 [label="{{<i1>}|<n3>|{<o1>|<o2>}}"]; + n4 [label="{{<i1>|<i2>}|<n4>|{<o1>}}"]; + + n1:o1 -> n2:i1 + n1:o2 -> n2:i2 + n1:o2 -> n3:i1:n; + n2:o1 -> n4:i1 + n3:o1 -> n4:i2 +} +\enddot + +\subsection open-dataflow-variant Open Dataflow Variant + +"Open" should be interpreted in the topological sense: that is, a graph that contains some edges where one of the edge's 2 nodes is not present in the graph itself. +This graph class is particularly useful for processing a sub-graph of a given graph while still maintaining information regarding the edges that cross the cut. +\ref DataflowGraphInput is used to represent the open (incoming) inputs to the graph. Note that, unlike \ref DataflowInput, \ref DataflowGraphInput ""s are unordered (given that they are inputs to possibly several different nodes within the graph).
+ +\subsection labelled-dataflow-variant Labelled Dataflow Variant + +As nice as all of the above is, graphs without labels are mostly useless--in practice, nodes and edges represent some other system and the properties of that system (or at least a way to map the result of graph algorithms back to the underlying system) are necessary. +Thus, FlexFlow's graph library provides the ability to add labels to \ref DataflowGraph, through the \ref LabelledDataflowGraph and \ref LabelledOpenDataflowGraph, which allow users to label different components of the graph. +- \ref LabelledDataflowGraph allows for labelling of \ref Node ""s and \ref DataflowOutput ""s. +- \ref LabelledOpenDataflowGraph allows for labelling of \ref Node ""s and \ref OpenDataflowValue ""s, which is a variant describing both \ref DataflowOutput ""s and \ref DataflowGraphInput ""s. + +While the interfaces of these graphs differ slightly from the core graph variants, they still have the corresponding \ref LabelledDataflowGraph::add_node methods, and \ref LabelledDataflowGraph::query_nodes / \ref LabelledDataflowGraph::query_edges methods. (Note that there is no \c add_edge method since, for \ref DataflowGraph, edges are implicitly added when we add a node and specify its predecessors.) +Note that all of the labelled graph types require that each element of the labelled types have a label, which is enforced via the interfaces they provide. +Partial labelling can be implemented via wrapping the label type in \c std::optional. +Interacting with \c Node and \c Edge objects is still necessary to use the labelled graph types: intuitively the labelled graph types can be thought of as a pair of a core graph variant and a hash map that maps nodes/edges to labels. +As such, the labelled graph types provide the typical \ref LabelledDataflowGraph::at method (as on \c std::unordered_map \ref graph-footnote-3 "[3]") and can be coerced to their underlying core graph variants.
+ +\section graph-internals Internals + +\subsection cow-ptr-and-interfaces cow_ptr_t and Interfaces + +The reason for the existence of the \c View variants has been explained in previous sections. +The existence of the \c "I(nterface)" variants stems from C++'s approach to modeling polymorphism. + +C++ polymorphism is achieved at runtime through the use of virtual functions, which allow for a single function defined on some superclass to also work correctly on its subclasses. + +To create objects with polymorphic behaviour, we use the following syntax: + +\code +BaseClass* obj = new DerivedClass(); // or alternatives such as std::shared_ptr<BaseClass> obj = std::make_shared<DerivedClass>(); +\endcode + +Any call to \c obj 's member functions is resolved at runtime (dynamic binding), with C++ calling the most derived implementation of the function. + +While this pattern works nicely, the way instantiation is done leaves the burden of memory management on the user. +To address this, graph classes store a \ref cow_ptr_t as a member variable, which points to instances of type equal to their corresponding interface class. + +All member functions present in \c ClassName and \c ClassNameView delegate their calls to their corresponding interface classes (which implement the actual logic), meaning that these classes essentially act as wrappers to their interface counterparts. + +\subsection virtual-inheritance Virtual Inheritance + +Due to the complexity of the graph library, diamond-style inheritance patterns emerge. +In the case of a diamond inheritance pattern, C++ will instantiate multiple copies of the base class whenever we instantiate a derived class. +To address this issue, we employ virtual inheritance, which removes the ambiguity associated with the multiple copies. +
+ +1. \anchor graph-footnote-1 At some point we will likely add actual runtime checks on this, but for now we rely on the user not to mess up. Currently the implementation will keep going silently until the incorrectness grows so large that something breaks/crashes. +2. \anchor graph-footnote-2 See here if you're not familiar with the term type coercion. +3. \anchor graph-footnote-3 operator[] currently is not present because all nodes must have labels and we don't require label types to be default constructible, though some simple template programming could probably add operator[] support in the cases where the label types are default constructible. + +*/ +} diff --git a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml index f286fb90a7..5b537eac88 100644 --- a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml +++ b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml @@ -6,6 +6,7 @@ features = [ "ord", "hash", "fmt", + "json", ] template_params = [ diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h index 8d5937427d..60d3aa6aa9 100644 --- a/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h +++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/escribano_algo.h @@ -18,9 +18,7 @@ std::unordered_set std::unordered_map const &node_roles); /** - * @brief See @ref - * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md - * "README.md" for explanation. + * \brief See \ref spization-escribano. 
*/ SeriesParallelDecomposition escribano_sp_ization(DiGraph g); diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h index a6f5a8d34a..93a4e29fa2 100644 --- a/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h +++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/flexible_algo.h @@ -13,9 +13,7 @@ namespace FlexFlow { /** - * @brief See @ref - * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md - * "README.md" for explanation. + * \brief See \ref spization-flexible. */ SeriesParallelDecomposition flexible_sp_ization(DiGraphView const &g, diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/README.md b/lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox similarity index 80% rename from lib/utils/include/utils/graph/series_parallel/sp_ization/README.md rename to lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox index 28a8ce7823..62d6ef542d 100644 --- a/lib/utils/include/utils/graph/series_parallel/sp_ization/README.md +++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/index.dox @@ -1,4 +1,7 @@ -# SP-ization +namespace FlexFlow { +/** + +\page spization utils/graph/series_parallel/sp_ization/ As a refresher, a series-parallel decomposition (SPD) is an algebraic datatype that looks as follows: ```haskell @@ -19,9 +22,11 @@ We have 2 main ways of achieving this: 1. **Work (Node) Duplicating SP-ization**: preserves the critical path, but may duplicate nodes 2. 
**Dependency (Edge) Addition SP-ization**: preserves the set of nodes, but may add edges -## Node (Work) Duplicating SP-ization +\section spization-work-duplicating Node (Work) Duplicating SP-ization + +\subsection spization-naive-work-duplicating Naive Work Duplicating -### Naive ([work_duplicating_sp_ization.h](work_duplicating_sp_ization.h)) +Implemented in \ref work_duplicating_sp_ization.h (more specifically, \ref naive_work_duplicating_sp_ization). Transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. The critical path cost is unchanged, and the SP-ization is done solely through node duplication. @@ -62,7 +67,9 @@ digraph SP { We can roughly think of it as the parallel composition of all the possible paths from source to sink. -### With Coalescing ([work_duplicating_sp_ization.h](work_duplicating_sp_ization.h)) +\subsection spization-with-coalescing With Coalescing + +Implemented in \ref work_duplicating_sp_ization.h (more specifically, \ref work_duplicating_sp_ization_with_coalescing). Transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph with coalescing. The critical path cost is unchanged, and the SP-ization is done solely through node (work) duplication. @@ -94,11 +101,13 @@ digraph SP { } ``` -## Dependency Addition SP-ization +\section spization-dependency-addition Dependency Addition SP-ization -### Naive Stratum Sync ([naive_stratum_sync.h](naive_stratum_sync.h)) +\subsection spization-naive Naive Stratum Sync -`naive_stratum_sync_sp_ization` transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. The total number of nodes remains unchanged, and the SP-ization is done solely through edge (dependency) addition. +Implemented in \ref naive_stratum_sync.h (more specifically, \ref naive_stratum_sync_sp_ization). + +\ref naive_stratum_sync_sp_ization transforms a directed acyclic graph (DAG) into a Series Parallel (SP) graph. 
The total number of nodes remains unchanged, and the SP-ization is done solely through edge (dependency) addition. The graph is first partitioned into strata: the i\_th stratum contains all the nodes whose critical path length has length i. The nodes in a given stratum are composed in parallel, and the strata are serially composed in succession. @@ -127,9 +136,12 @@ digraph SP { } ``` -### Escribano Algorithm ([escribano_algo.h](escribano_algo.h)) +\subsection spization-escribano Escribano Algorithm + +Implemented in \ref escribano_algo.h (more specifically, \ref escribano_sp_ization). + +Paper can be found <a href="https://www.infor.uva.es/wp-content/uploads/2016/10/IT-DI-2002-0002.pdf">here</a>. -Paper is present here: https://www.infor.uva.es/wp-content/uploads/2016/10/IT-DI-2002-0002.pdf. In the naive stratum sync algorithm, we add an all-to-all connection between all nodes in one stratum and the next. The escribano algorithm by contrast, leverages the fact that it might be possible to synchronize consecutive strata by adding smaller, more local connections that still yield a valid SP-ization graph. Example: @@ -148,21 +160,23 @@ digraph G { } ``` -The strata are: {0}, {1, 2, 3}, {4, 5, 6}, {7}. +The strata are: `{0}, {1, 2, 3}, {4, 5, 6}, {7}`. The naive stratum sync yields the following, adding an all-to-all connection between consecutive strata: ``` S(0, P(1, 2, 3), P(4, 5, 6), 7) ``` -While the escribano algorithm is able to identify that strata 1 and 2 can be synced without adding an all-to-all connection: nodes {1, 2} only connect to {4, 5}, and node {3} only connects to {6}. It thus yields the following: +While the escribano algorithm is able to identify that strata 1 and 2 can be synced without adding an all-to-all connection: nodes `{1, 2}` only connect to `{4, 5}`, and node `{3}` only connects to `{6}`. It thus yields the following: ``` S(0, P(S(P(1, 2), P(4, 5)), S(3, 6)), 7) ``` Our implementation, rather than building the SPD one stratum at a time, builds it one node at a time.
-### Flexible Algorithm ([flexible_algo.h](flexible_algo.h)) +\subsection spization-flexible Flexible Algorithm + +Implemented in \ref flexible_algo.h (more specifically, \ref flexible_sp_ization). Consider the following N-graph: @@ -192,3 +206,6 @@ The flexible algorithm expands the escribano algorithm by generalizing it to suc In the escribano algorithm, once the sync area (the "forest") is identified, the partition into up and down sets is fixed: up is everything but the last layer, down is the last layer. But this is an arbitrary choice; there are multiple valid ways to partition the forest into an up set and a down set (across which we sync). The flexible algorithm exploits this by searching across all valid up/down partitions of the forest and selecting the one that minimizes the sum of critical path costs of the up and down subgraphs (i.e., the critical path cost of the resulting SP-ized subgraph after the sync). + +*/ +} diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h index 8cf38a1575..c782497155 100644 --- a/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h +++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/naive_stratum_sync.h @@ -7,9 +7,7 @@ namespace FlexFlow { /** - * @brief See @ref - *lib/utils/include/utils/graph/series_parallel/sp_ization/README.md "README.md" - *for explanation. + * \brief See \ref spization-naive. 
**/ SeriesParallelDecomposition naive_stratum_sync_sp_ization(DiGraphView const &g); diff --git a/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h b/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h index 8973d44a6d..c6dd87d2a0 100644 --- a/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h +++ b/lib/utils/include/utils/graph/series_parallel/sp_ization/work_duplicating_sp_ization.h @@ -8,17 +8,13 @@ namespace FlexFlow { /** - * @brief See @ref - * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md - * "README.md" for explanation. + * \brief See \ref spization-naive-work-duplicating. */ SeriesParallelDecomposition naive_work_duplicating_sp_ization(DiGraphView const &g); /** - * @brief See @ref - * lib/utils/include/utils/graph/series_parallel/sp_ization/README.md - * "README.md" for explanation. + * @brief See \ref spization-with-coalescing. */ SeriesParallelDecomposition work_duplicating_sp_ization_with_coalescing(DiGraphView const &g); diff --git a/lib/utils/include/utils/orthotope/index.dox b/lib/utils/include/utils/orthotope/index.dox new file mode 100644 index 0000000000..8dace16bc1 --- /dev/null +++ b/lib/utils/include/utils/orthotope/index.dox @@ -0,0 +1,7 @@ +namespace FlexFlow { +/** + +\page utils-orthotope Orthotope and Friends + +*/ +} diff --git a/lib/utils/index.dox b/lib/utils/index.dox new file mode 100644 index 0000000000..e374cb5a43 --- /dev/null +++ b/lib/utils/index.dox @@ -0,0 +1,37 @@ +namespace FlexFlow { +/** + +\page utils utils + +\brief Various utility and support libraries for the rest of the project. Particularly useful are \ref "utils-graph", \ref "utils-containers", and \ref "utils-cli". 
+ +Major components: +- \subpage utils-containers +- \subpage utils-cli +- \subpage utils-graph +- \subpage utils-restricted-int-types +- \subpage utils-restricted-map-types +- \subpage utils-orthotope + +*/ + +/** + +\page utils-restricted-int-types Restricted Integer Types + +- \ref nonnegative_int +- \ref positive_int +- \ref int_ge_two + +*/ + +/** + +\page utils-restricted-map-types Restricted Map Types + +- \ref bidict +- \ref OneToMany +- \ref ManyToOne + +*/ +} diff --git a/lib/utils/src/utils/any_value_type/any_value_type.cc b/lib/utils/src/utils/any_value_type/any_value_type.cc index d4c605c441..0e55967e05 100644 --- a/lib/utils/src/utils/any_value_type/any_value_type.cc +++ b/lib/utils/src/utils/any_value_type/any_value_type.cc @@ -24,11 +24,13 @@ std::string format_as(any_value_type const &v) { } // namespace FlexFlow +///\cond namespace std { size_t hash<::FlexFlow::any_value_type>::operator()( ::FlexFlow::any_value_type const &v) const { return v.hash(v); } +///\endcond } // namespace std diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc index f75145b6af..72c2d9d3c7 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc @@ -22,8 +22,7 @@ std::string as_dot(OpenDataflowGraphView const &g) { return as_dot(g, get_node_label, get_input_label); } -/** - * WARN(@lockshaw): doing this all with string ids is ugly and error prone, +/* WARN(@lockshaw): doing this all with string ids is ugly and error prone, * as it requires duplicating the stringification logic across functions. 
* * Fixing this is tracked in issue diff --git a/lib/utils/src/utils/graph/series_parallel/graph_generation.cc b/lib/utils/src/utils/graph/series_parallel/graph_generation.cc index 1390b51db5..e60db0a87f 100644 --- a/lib/utils/src/utils/graph/series_parallel/graph_generation.cc +++ b/lib/utils/src/utils/graph/series_parallel/graph_generation.cc @@ -13,8 +13,7 @@ void parallel_extend_unsafe(DataflowGraph &g, DataflowGraphView const &ext) { } void series_extend_unsafe(DataflowGraph &g, DataflowGraphView const &ext) { - /** - * TODO(@lockshaw): This function signature is impossible to implement in + /* TODO(@lockshaw): This function signature is impossible to implement in * general, as there is no guarantee that the graph view ext actually has * source nodes with inputs Either the signature should be changed, or an * implementation should be added that throws an error if this problematic diff --git a/lib/utils/src/utils/graph/traversal.cc b/lib/utils/src/utils/graph/traversal.cc index fff5a0f958..a4df327b2a 100644 --- a/lib/utils/src/utils/graph/traversal.cc +++ b/lib/utils/src/utils/graph/traversal.cc @@ -6,32 +6,27 @@ namespace FlexFlow { -using cdi = checked_dfs_iterator; -using udi = unchecked_dfs_iterator; -using bfi = bfs_iterator; -/* using bdi = BoundaryDFSView::boundary_dfs_iterator; */ - -udi::unchecked_dfs_iterator(DiGraphView const &g, - std::vector const &stack) +unchecked_dfs_iterator::unchecked_dfs_iterator(DiGraphView const &g, + std::vector const &stack) : stack(stack), graph(g) {} -udi::unchecked_dfs_iterator(DiGraphView const &g, - std::unordered_set const &starting_points) +unchecked_dfs_iterator::unchecked_dfs_iterator( + DiGraphView const &g, std::unordered_set const &starting_points) : graph(g) { for (Node const &n : starting_points) { this->stack.push_back(n); } } -udi::reference udi::operator*() const { +unchecked_dfs_iterator::reference unchecked_dfs_iterator::operator*() const { return this->stack.back(); } -udi::pointer udi::operator->() { 
+unchecked_dfs_iterator::pointer unchecked_dfs_iterator::operator->() { return &this->operator*(); } -udi &udi::operator++() { +unchecked_dfs_iterator &unchecked_dfs_iterator::operator++() { Node const last = this->operator*(); this->stack.pop_back(); @@ -48,41 +43,43 @@ udi &udi::operator++() { return *this; } -void udi::skip() { +void unchecked_dfs_iterator::skip() { this->stack.pop_back(); } -udi udi::operator++(int) { +unchecked_dfs_iterator unchecked_dfs_iterator::operator++(int) { auto tmp = *this; ++(*this); return tmp; } -bool udi::operator==(udi const &other) const { +bool unchecked_dfs_iterator::operator==( + unchecked_dfs_iterator const &other) const { return this->stack == other.stack; } -bool udi::operator!=(udi const &other) const { +bool unchecked_dfs_iterator::operator!=( + unchecked_dfs_iterator const &other) const { return this->stack != other.stack; } -cdi::checked_dfs_iterator(DiGraphView const &g, - std::vector const &stack, - std::unordered_set const &seen) +checked_dfs_iterator::checked_dfs_iterator(DiGraphView const &g, + std::vector const &stack, + std::unordered_set const &seen) : iter(g, stack), seen(seen) {} -cdi::checked_dfs_iterator(DiGraphView const &g, - std::unordered_set const &starting_points) +checked_dfs_iterator::checked_dfs_iterator( + DiGraphView const &g, std::unordered_set const &starting_points) : iter(g, starting_points), seen{} {} -cdi::reference cdi::operator*() const { +checked_dfs_iterator::reference checked_dfs_iterator::operator*() const { return this->iter.operator*(); } -cdi::pointer cdi::operator->() { +checked_dfs_iterator::pointer checked_dfs_iterator::operator->() { return this->iter.operator->(); } -cdi &cdi::operator++() { +checked_dfs_iterator &checked_dfs_iterator::operator++() { this->seen.insert(*iter); this->iter++; while (contains(this->seen, *iter)) { @@ -91,42 +88,42 @@ cdi &cdi::operator++() { return *this; } -cdi cdi::operator++(int) { +checked_dfs_iterator checked_dfs_iterator::operator++(int) { 
auto tmp = *this; ++(*this); return tmp; } -bool cdi::operator==(cdi const &other) const { +bool checked_dfs_iterator::operator==(checked_dfs_iterator const &other) const { return this->iter == other.iter && this->seen == other.seen; } -bool cdi::operator!=(cdi const &other) const { +bool checked_dfs_iterator::operator!=(checked_dfs_iterator const &other) const { return this->iter != other.iter && this->seen != other.seen; } -bfi::bfs_iterator(DiGraphView const &g, - std::queue const &q, - std::optional> const &seen) +bfs_iterator::bfs_iterator(DiGraphView const &g, + std::queue const &q, + std::optional> const &seen) : graph(g), q(q), seen(seen) {} -bfi::bfs_iterator(DiGraphView const &g, - std::unordered_set const &starting_points) +bfs_iterator::bfs_iterator(DiGraphView const &g, + std::unordered_set const &starting_points) : graph(g), seen(std::unordered_set{}) { for (Node const &n : starting_points) { this->q.push(n); } } -bfi::reference bfi::operator*() const { +bfs_iterator::reference bfs_iterator::operator*() const { return this->q.front(); } -bfi::pointer bfi::operator->() { +bfs_iterator::pointer bfs_iterator::operator->() { return &this->operator*(); } -bfi &bfi::operator++() { +bfs_iterator &bfs_iterator::operator++() { Node current = this->operator*(); assert(this->seen.has_value()); this->seen.value().insert(current); @@ -147,20 +144,20 @@ bfi &bfi::operator++() { return *this; } -bfi bfi::operator++(int) { +bfs_iterator bfs_iterator::operator++(int) { auto tmp = *this; ++(*this); return tmp; } -bool bfi::operator==(bfi const &other) const { +bool bfs_iterator::operator==(bfs_iterator const &other) const { return this->q == other.q && (!this->seen.has_value() || !other.seen.has_value() || this->seen == other.seen) && is_ptr_equal(this->graph, other.graph); } -bool bfi::operator!=(bfi const &other) const { +bool bfs_iterator::operator!=(bfs_iterator const &other) const { return this->q != other.q || (this->seen.has_value() && other.seen.has_value() && 
this->seen != other.seen) && diff --git a/lib/utils/src/utils/half.cc b/lib/utils/src/utils/half.cc index 3dbea5c4dc..7a4415ab62 100644 --- a/lib/utils/src/utils/half.cc +++ b/lib/utils/src/utils/half.cc @@ -1,6 +1,7 @@ #include "utils/half.h" #include "utils/hash-utils.h" +///\cond namespace std { size_t hash::operator()(half h) const { @@ -8,3 +9,4 @@ size_t hash::operator()(half h) const { } } // namespace std +///\endcond